~berfr/commit-history-data-analysis

52961ab33d317e10dd736f7c3e5c70a038690d45 — berfr 10 months ago
Initial commit
6 files changed, 215 insertions(+), 0 deletions(-)

A .gitignore
A README.md
A analyze_results.ipynb
A get-data.sh
A git-csvlog
A requirements.txt
A  => .gitignore +3 -0
@@ 1,3 @@
.ipynb_checkpoints/
data/
venv/

A  => README.md +20 -0
@@ 1,20 @@
# commit-history-data-analysis

## About

This project contains scripts to aggregate and analyze git commit history. The `get-data.sh` script scans a base directory for git repositories, writes each repository's log in CSV format to the `data` directory, and finally merges all results into `data/results.csv`. To analyze the resulting data, open the Jupyter notebook as shown below and rerun all cells.
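Each row of `data/results.csv` is a line of `git-csvlog` output (see below) prefixed with a `repo` column, so the merged file starts with this header line:

```
"repo","commit hash","author name","author email","author date","committer name","committer email","committer date","ref names","subject"
```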

## Instructions

```shell
# put `git-csvlog` in a directory in your `$PATH`
ln -s "$PWD/git-csvlog" ~/bin/

# run `get-data.sh` with the base directory and a list of authors as parameters
./get-data.sh ~/code/ "author 1" "author 2"

# run data analysis on data
python3 -m venv venv && . venv/bin/activate
pip install -r requirements.txt
jupyter notebook analyze_results.ipynb
```

A  => analyze_results.ipynb +123 -0
@@ 1,123 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data/results.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['author date'] = pd.to_datetime(df['author date'], utc=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.set_index('author date')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['year'] = df.index.year\n",
    "df['month'] = df.index.month\n",
    "df['weekday_name'] = df.index.weekday_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['year']).describe()['month'].plot(kind='bar', y='count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['month']).describe()['year'].plot(kind='bar', y='count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['weekday_name']).describe()['year'].plot(kind='bar', y='count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['year', 'month', 'weekday_name']).describe()['repo'].plot(kind='bar', y='count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['year', 'month']).count()['repo'].unstack().transpose().fillna(0).plot(kind='bar')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

A  => get-data.sh +42 -0
@@ 1,42 @@
#!/usr/bin/env bash

# usage: ./get-data.sh PATH [AUTHOR ...]

base_dir=$1
results_file=data/results.csv
mkdir -p data

# prepare author filter list as an array so author names with spaces stay intact
authors=()
for author in "${@:2}"
do
    authors+=(--author="$author")
done

# create a result csv file for each repo found
while IFS= read -r -d '' git_dir
do
    result_file="data/$(realpath "$git_dir" | sed -e 's/^\///' -e 's/\/\.git$//' -e 's/\//-/g').csv"
    git -C "$git_dir" csvlog "${authors[@]}" > "$result_file" 2> /dev/null
    lines=$(wc -l < "$result_file")
    echo "$git_dir -> $result_file -> $((lines - 1))"
done < <(find "$base_dir" -type d -name ".git" -print0)

# prepare new results file with csv header
rm -f "$results_file"
header="\"repo\",$(head -n 1 "$(find data -type f -name '*.csv' -print -quit)")"
echo "$header" > "$results_file"

# merge all repo csv files into results.csv, prefixing each row with the repo name
while IFS= read -r -d '' repo_file
do
    repo_name=$(basename "$repo_file" .csv)
    sed -e "s/^/\"$repo_name\",/" "$repo_file" | tail -n +2 >> "$results_file"
done < <(find data -type f -name '*.csv' ! -name results.csv -print0)

# output results filename and line count
echo
lines=$(wc -l < "$results_file")
echo "$results_file -> $((lines - 1))"

A  => git-csvlog +24 -0
@@ 1,24 @@
#!/usr/bin/env bash

info="
commit hash: %H
author name: %an
author email: %ae
author date: %ad
committer name: %cn
committer email: %ce
committer date: %cd
ref names: %d
subject: %s
"

# TODO: escape occasional newlines in these values
#   body: %b
#   commit notes: %N

# drop the leading and trailing blank lines from the format template
info=$(echo "${info}" | sed '1d;$d')

# turn each "label: %x" line into a quoted CSV header field
header=$(echo "${info}" | sed -r 's/: %.*/","/' | tr -d '\n' | sed -e 's/..$//' -e 's/^/"/')

# keep only the format placeholders and join them into a quoted CSV row template
placeholders=$(echo "${info}" | sed -r 's/.*: //' | tr '\n' ',' | sed -e 's/,/","/g' -e 's/..$//' -e 's/^/"/')

echo "$header"
git --no-pager log --pretty=tformat:"$placeholders" "$@"

A  => requirements.txt +3 -0
@@ 1,3 @@
jupyter
matplotlib
pandas