Notebook Example
Please note that:
- This notebook pulls from public repositories in github.com/pypa, an open-source organization (using private/proprietary repos for a public example would be unethical, and illegal in most jurisdictions)
- The (typically) sporadic nature of open-source project contributions isn't conducive to analyzing developer productivity, since for most authors these projects are not a full-time job
- ...so no meaning should be inferred from the plots below; this notebook is intended only to demonstrate use of git-author-stats with the provided template
Extract Stats
from __future__ import annotations
import os
from datetime import date, timedelta
from pathlib import Path
import polars
from git_author_stats import iter_stats, read_stats, write_stats
TODAY: date = date.today()
WEEK_START: date = TODAY - timedelta(days=TODAY.weekday())
HISTORY_WEEKS: int = 56
STALE_WEEKS: int = 8
# Because local commits, and commits made on non-default branches, will not
# have been incorporated into our stats when this notebook was last run,
# we'll start looking for stats from the Monday `STALE_WEEKS` weeks before
# the start of the current week, in order to pick up any commits which were
# subsequently merged. The date from which we'll retrieve stats will be
# either the Monday preceding our last extracted stats or this "tail" date,
# whichever is earlier. We also strip previously retrieved stats for this
# time period from our existing data, so that we don't double-count.
TAIL_START: date = WEEK_START - timedelta(
days=STALE_WEEKS * 7
)
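# For example (hypothetical dates): if today were Wednesday 2024-05-15,
# WEEK_START would be Monday 2024-05-13 and, with STALE_WEEKS = 8,
# TAIL_START would be Monday 2024-03-18.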
DATA_PATH: Path = Path("data").absolute()
os.makedirs(DATA_PATH, exist_ok=True)
def cut_off_tail() -> date:
"""
Cut off the "tail" of our previously extracted stats, if needed, and return
the date from which we should start looking for stats to extract.
"""
since: date = date.today() - timedelta(
days=HISTORY_WEEKS * 7
)
# ...make it a Monday
since -= timedelta(days=since.weekday())
latest_path: Path | None = None
path: Path
for path in filter(Path.is_file, DATA_PATH.iterdir()):
name: str
extension: str
name, extension = path.name.rpartition(".")[::2]
if extension.lower() == "csv":
file_date: date = date.fromisoformat(name)
if file_date >= TAIL_START:
path.unlink()
elif (file_date >= since) or (latest_path is None):
since = file_date
latest_path = path
if latest_path is not None:
# Strip tail stats from our latest CSV file
polars.LazyFrame(read_stats(latest_path)).filter(
polars.col("before") < TAIL_START
).collect().write_csv(latest_path)
# Get the first date not covered by our CSVs, now that we've
# cut off their tail
since = date.fromisoformat(
polars.read_csv(latest_path).select(polars.max("before")).item()
)
return since
def update_stats() -> None:
"""
Extract stats to a CSV in the data directory as needed to bring us
up-to-date
"""
since: date = cut_off_tail()
    # Look up stats, starting the day after the last date covered by our
    # previously retrieved records, and ending with the most recent
    # complete week
write_stats(
iter_stats(
"github.com/pypa",
            password=os.environ.get(
                # Put the name of the environment variable holding your
                # GitHub token here (left blank in this example)
                "",
                ""
            ),
since=since,
# Don't include this week, since it isn't over yet
before=WEEK_START,
frequency="1w",
),
f"data/{since.isoformat()}.csv",
)
update_stats()
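
Once update_stats() has run, the data directory holds one CSV per extraction. As a quick sanity check, the files can be scanned back with polars; a minimal sketch (using the same "since" and "before" column names written by write_stats):

polars.scan_csv("data/*.csv").select(
    polars.min("since").alias("earliest"),
    polars.max("before").alias("latest"),
).collect()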
Plot Stats
from collections.abc import Sequence
from datetime import date, timedelta
from functools import lru_cache
from operator import itemgetter
import matplotlib.axes
import matplotlib.figure
import matplotlib.pyplot
import matplotlib.ticker
LOCATOR: matplotlib.ticker.MultipleLocator = matplotlib.ticker.MultipleLocator(
7.0
)
def scan_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> polars.LazyFrame:
if isinstance(include_author_names, str):
include_author_names = (include_author_names,)
if isinstance(exclude_author_names, str):
exclude_author_names = (exclude_author_names,)
stats: polars.LazyFrame = (
polars.scan_csv("data/*.csv")
        # TODO: Modify the following filter to exclude commits/files
        # which you don't want contributing to your stats.
.filter(
# Config, JSON, requirement files, and the like are often
# modified by project management tools
(
polars.col("file").str.contains(
r"(?i)\.(txt|ini|cfg|toml|yaml|yml|json)$"
)
            # TODO: Modify the following to match your bot/service account IDs
| polars.col("author_name").str.contains(
r"(?i)(\[bot\]$|github-actions|dependabot)",
)
).not_()
)
.select(
(
"url",
(
polars.col("author_name")
.str.to_titlecase()
# TODO: Normalize author names here, if needed
# .replace(
# "Anomalous Name Variation",
# "Normalized Name",
# )
),
polars.col("since").str.to_date(),
polars.col("before").str.to_date(),
"insertions",
"deletions",
"commit",
)
)
.filter(
polars.col("since") > date.today() - timedelta(
days=number_of_weeks * 7
)
)
)
if include_author_names:
stats = stats.filter(
polars.col("author_name").is_in(include_author_names)
)
if exclude_author_names:
stats = stats.filter(
polars.col("author_name").is_in(exclude_author_names).not_()
)
return stats
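# For example, to also drop vendored files (hypothetical path pattern), the
# exclusion expression in the first .filter(...) above could gain one more
# alternative before being negated:
#
#     | polars.col("file").str.contains(r"(?i)(^|/)vendor/")
#
# Note that this must happen before the .select(...), which drops the
# "file" column.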
def scan_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> polars.LazyFrame:
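    """
    Aggregate stats into one row per author per week: the first
    group_by/sum collapses per-file rows belonging to the same commit,
    and the second collapses those per-commit rows into weekly totals
    per author, across all repositories.
    """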
return (
scan_stats(number_of_weeks, include_author_names, exclude_author_names)
.group_by("url", "author_name", "since", "before", "commit")
.sum()
.select(("author_name", "since", "before", "insertions", "deletions"))
.group_by("author_name", "since", "before")
.sum()
.sort(("since", "author_name"))
)
def get_author_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with average weekly insertions/deletions for all
authors who have contributions spanning over at least the specified number
of weeks.
"""
authors_lazy_frame: polars.LazyFrame = scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
)
authors_data_frame: polars.DataFrame = (
scan_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
)
.join(
authors_lazy_frame,
on="author_name",
)
.select(
(
"author_name",
"since",
"before",
"insertions",
"deletions",
)
)
.group_by(
"author_name"
).agg(
since=polars.col("since").min(),
before=polars.col("before").max(),
insertions=polars.sum("insertions"),
deletions=polars.sum("deletions"),
active_weeks=polars.col("since").count()
).collect()
)
# Get the number of weeks each author has been active.
# Here we use a start and end date rather than a count of weeks
# with active contributions, since a contribution may be worked on
# over the course of several weeks. So long as employment is mostly
# continuous, this will provide meaningful metrics. If the evaluated
# time period includes at least a year's history, and employees
# have a similar number of vacation days/weeks, vacation gaps won't
# skew our stats too much either. Longer employment gaps, such as for
# sabbaticals and/or laid-off/re-hired authors, however, will
# meaningfully degrade that author's stats.
# TODO: Account for long employment gaps, if needed.
authors_data_frame = authors_data_frame.with_columns(
weeks=(
(polars.col("before") - polars.col("since")).dt.total_days() / 7
).cast(int)
).with_columns(
weekly_insertions=(
polars.col("insertions") / polars.col("weeks")
).cast(int),
weekly_deletions=(polars.col("deletions") / polars.col("weeks")).cast(
int
),
)
return authors_data_frame.sort("author_name", descending=True)
def get_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with insertions/deletions per week, by author.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since", "author_name")
.sum()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("author_name").alias("Author"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
).collect()
def get_sum_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with total insertions/deletions per week, for all authors.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since")
.sum()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
).collect()
def get_author_label(
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> str:
"""
Get the label to use based on included/excluded author names
"""
if isinstance(include_author_names, str):
include_author_names = (include_author_names,)
if isinstance(exclude_author_names, str):
exclude_author_names = (exclude_author_names,)
label: list[str] = []
author_names: tuple[str, ...] = ()
if exclude_author_names:
if include_author_names:
exclude_author_names_set: frozenset[str] = frozenset(
exclude_author_names
)
author_names = tuple(
author_name
for author_name in include_author_names
if author_name not in exclude_author_names_set
)
else:
label.append("All Active Authors Except")
author_names = exclude_author_names
else:
if include_author_names:
author_names = include_author_names
else:
label.append("All Active Authors")
if author_names:
if len(author_names) == 1:
label += (author_names[0],)
else:
label += "{} and {}".format(
", ".join(author_names[:-1]),
author_names[-1],
)
return " ".join(label)
def plot_weekly_sum(
number_of_weeks: int = 32,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
author_label = get_author_label(include_author_names, exclude_author_names)
axes: matplotlib.axes.Axes = (
get_sum_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
.to_pandas()
.plot.area(
title=(
f"{author_label} - Weekly Total - "
f"Past {number_of_weeks} Weeks"
),
x="Week",
y=["Insertions", "Deletions"],
figsize=(32, 4),
rot=90,
xlabel="Week",
ylabel="Insertions + Deletions",
x_compat=True,
)
)
axes.xaxis.set_major_locator(LOCATOR)
axes.xaxis.set_ticks_position("none")
return axes
def get_mean_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
return (
scan_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since")
.mean()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
.collect()
)
def plot_weekly_mean(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
author_label = get_author_label(include_author_names, exclude_author_names)
axes: matplotlib.axes.Axes = (
get_mean_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
)
.to_pandas()
.plot.area(
title=(
f"{author_label} - Weekly per/Author Average - "
f"Past {number_of_weeks} Weeks"
),
x="Week",
y=["Insertions", "Deletions"],
figsize=(24, 6),
rot=90,
xlabel="Week",
ylabel="Insertions",
x_compat=True,
)
)
axes.xaxis.set_major_locator(LOCATOR)
axes.xaxis.set_ticks_position("none")
return axes
def scan_authors(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.LazyFrame:
"""
Get a lazy frame with the names of all authors who have contributed
over at least the specified number of weeks.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
)
.group_by("author_name", "since")
.agg()
.group_by("author_name")
.agg(
weeks=polars.col("since").count()
)
.filter(
polars.col("weeks") >= author_minimum_weeks
)
        # We only need the author names, so we can drop the remaining columns
.select(
polars.col("author_name")
)
)
@lru_cache
def count_authors(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> int:
"""
Return the number of authors
"""
return scan_authors(
number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
).collect().height
def plot_author_weekly_insertions(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> Sequence[matplotlib.axes.Axes]:
"""
Plot weekly insertions for all authors over the past `number_of_weeks`
"""
author_label: str = get_author_label(
include_author_names,
exclude_author_names,
)
axes: matplotlib.axes.Axes
number_of_authors: int = count_authors(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
authors_axes: Sequence[matplotlib.axes.Axes] = (
get_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks,
)
.to_pandas()
.pivot(index="Week", columns="Author", values="Insertions")
.plot.area(
title=(
f"Weekly Insertions - Past {number_of_weeks} Weeks - "
f"{author_label}"
),
figsize=(
24,
number_of_authors
),
rot=90,
x_compat=True,
subplots=True,
)
)
# Get the minimum and maximum y-bounds for all axes, in order to display
# all data at the same scale
ybounds: tuple[tuple[float, float], ...] = tuple(
map(matplotlib.axes.Axes.get_ybound, authors_axes)
)
ybound: tuple[float, float] = (
max(min(map(itemgetter(0), ybounds)), 0),
max(map(itemgetter(1), ybounds)),
)
# Set the y-bounds for all axes to the same values
for axes in authors_axes:
axes.set_ybound(*ybound)
axes.xaxis.set_ticks_position("none")
axes.autoscale(enable=True, axis="y", tight=True)
authors_axes[-1].xaxis.set_major_locator(LOCATOR)
return authors_axes
def plot_author_means(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
"""
Plot the weekly average insertions and deletions for all authors over
    the past `number_of_weeks`.
"""
author_label: str = get_author_label(
include_author_names,
exclude_author_names,
)
number_of_authors: int = count_authors(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
return get_author_stats(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
).select(
polars.col("author_name").alias("Author"),
polars.col("weekly_insertions").alias("Weekly Insertions"),
polars.col("weekly_deletions").alias("Weekly Deletions"),
).to_pandas().plot.barh(
x="Author",
y=["Weekly Insertions", "Weekly Deletions"],
stacked=True,
title=(
f"Past {number_of_weeks} Weeks: "
f"Weekly Insertions and Deletions - {author_label}"
),
figsize=(24, number_of_authors / 4),
)
plot_author_means()
plot_weekly_mean()
plot_weekly_sum()
plot_author_weekly_insertions()
matplotlib.pyplot.show()
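
All of the plotting functions above accept the same filtering parameters, so the template can also chart a subset of contributors. A minimal sketch (the author name is hypothetical):

plot_weekly_mean(
    number_of_weeks=24,
    include_author_names="Jane Doe",
)
matplotlib.pyplot.show()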