Notebook Example
Please note that:
- This notebook pulls from public repositories in github.com/pypa, an open-source organization (using private/proprietary repos for a public example would be unethical, and illegal in most jurisdictions)
- The (typically) sporadic nature of open-source project contributions isn't conducive to analyzing developer productivity, since for most authors these projects are not a full-time job
- ...so no meaning should be inferred from the plots below; this notebook is intended only to demonstrate use of git-author-stats with the provided template
Extract Stats
from __future__ import annotations
import os
from datetime import date, timedelta
from pathlib import Path
import polars
from git_author_stats import iter_stats, read_stats, write_stats
TODAY: date = date.today()
WEEK_START: date = TODAY - timedelta(days=TODAY.weekday())
HISTORY_WEEKS: int = 56
STALE_WEEKS: int = 8
# Because local commits, and commits made on non-default branches, will not
# have been incorporated into our stats when this notebook was last run,
# we'll start looking for stats from the Monday `STALE_WEEKS` weeks before
# the start of the current week, in order to pick up any commits which were
# subsequently merged. The date from which we'll retrieve stats will be
# either the Monday preceding our last extracted stats or this "tail" date,
# whichever is earlier. We also strip previously retrieved stats for this
# time period from our existing data, so that we don't double-count.
TAIL_START: date = WEEK_START - timedelta(
days=STALE_WEEKS * 7
)
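# For example (hypothetical dates): if today were Wednesday 2024-05-15,
# WEEK_START would be Monday 2024-05-13 and, with STALE_WEEKS = 8,
# TAIL_START would be Monday 2024-03-18.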
DATA_PATH: Path = Path("data").absolute()
os.makedirs(DATA_PATH, exist_ok=True)
def cut_off_tail() -> date:
"""
Cut off the "tail" of our previously extracted stats, if needed, and return
the date from which we should start looking for stats to extract.
"""
since: date = date.today() - timedelta(
days=HISTORY_WEEKS * 7
)
# ...make it a Monday
since -= timedelta(days=since.weekday())
latest_path: Path | None = None
path: Path
for path in filter(Path.is_file, DATA_PATH.iterdir()):
name: str
extension: str
name, extension = path.name.rpartition(".")[::2]
if extension.lower() == "csv":
file_date: date = date.fromisoformat(name)
if file_date >= TAIL_START:
path.unlink()
elif (file_date >= since) or (latest_path is None):
since = file_date
latest_path = path
if latest_path is not None:
# Strip tail stats from our latest CSV file
polars.LazyFrame(read_stats(latest_path)).filter(
polars.col("before") < TAIL_START
).collect().write_csv(latest_path)
# Get the first date not covered by our CSVs, now that we've
# cut off their tail
since = date.fromisoformat(
polars.read_csv(latest_path).select(polars.max("before")).item()
)
return since
def update_stats() -> None:
"""
Extract stats to a CSV in the data directory as needed to bring us
up-to-date
"""
since: date = cut_off_tail()
    # Look up stats, starting the day after the last date covered by our
    # previously retrieved records, and ending with the most recent
    # complete week
write_stats(
iter_stats(
"github.com/pypa",
            password=os.environ.get(
                # Put the name of the environment variable holding your
                # GitHub token here (left blank in this example)
                "",
                ""
            ),
since=since,
# Don't include this week, since it isn't over yet
before=WEEK_START,
frequency="1w",
),
f"data/{since.isoformat()}.csv",
)
update_stats()
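
Once update_stats() has run, the data directory holds one CSV per extraction. As a quick sanity check, the files can be scanned back with polars; a minimal sketch (using the same "since" and "before" column names written by write_stats):

polars.scan_csv("data/*.csv").select(
    polars.min("since").alias("earliest"),
    polars.max("before").alias("latest"),
).collect()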
Plot Stats
from collections.abc import Sequence
from datetime import date, timedelta
from functools import lru_cache
from operator import itemgetter
import matplotlib.axes
import matplotlib.figure
import matplotlib.pyplot
import matplotlib.ticker
LOCATOR: matplotlib.ticker.MultipleLocator = matplotlib.ticker.MultipleLocator(
7.0
)
def scan_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> polars.LazyFrame:
if isinstance(include_author_names, str):
include_author_names = (include_author_names,)
if isinstance(exclude_author_names, str):
exclude_author_names = (exclude_author_names,)
stats: polars.LazyFrame = (
polars.scan_csv("data/*.csv")
        # TODO: Modify the following filter to exclude commits/files
        # which you don't want contributing to your stats.
.filter(
# Config, JSON, requirement files, and the like are often
# modified by project management tools
(
polars.col("file").str.contains(
r"(?i)\.(txt|ini|cfg|toml|yaml|yml|json)$"
)
            # TODO: Modify the following to match your bot/service account IDs
| polars.col("author_name").str.contains(
r"(?i)(\[bot\]$|github-actions|dependabot)",
)
).not_()
)
.select(
(
"url",
(
polars.col("author_name")
.str.to_titlecase()
# TODO: Normalize author names here, if needed
# .replace(
# "Anomalous Name Variation",
# "Normalized Name",
# )
),
polars.col("since").str.to_date(),
polars.col("before").str.to_date(),
"insertions",
"deletions",
"commit",
)
)
.filter(
polars.col("since") > date.today() - timedelta(
days=number_of_weeks * 7
)
)
)
if include_author_names:
stats = stats.filter(
polars.col("author_name").is_in(include_author_names)
)
if exclude_author_names:
stats = stats.filter(
polars.col("author_name").is_in(exclude_author_names).not_()
)
return stats
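# For example, to also drop vendored files (hypothetical path pattern), the
# exclusion expression in the first .filter(...) above could gain one more
# alternative before being negated:
#
#     | polars.col("file").str.contains(r"(?i)(^|/)vendor/")
#
# Note that this must happen before the .select(...), which drops the
# "file" column.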
def scan_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> polars.LazyFrame:
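    """
    Aggregate stats into one row per author per week: the first
    group_by/sum collapses per-file rows belonging to the same commit,
    and the second collapses those per-commit rows into weekly totals
    per author, across all repositories.
    """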
return (
scan_stats(number_of_weeks, include_author_names, exclude_author_names)
.group_by("url", "author_name", "since", "before", "commit")
.sum()
.select(("author_name", "since", "before", "insertions", "deletions"))
.group_by("author_name", "since", "before")
.sum()
.sort(("since", "author_name"))
)
def get_author_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with average weekly insertions/deletions for all
authors who have contributions spanning over at least the specified number
of weeks.
"""
authors_lazy_frame: polars.LazyFrame = scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
)
authors_data_frame: polars.DataFrame = (
scan_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
)
.join(
authors_lazy_frame,
on="author_name",
)
.select(
(
"author_name",
"since",
"before",
"insertions",
"deletions",
)
)
.group_by(
"author_name"
).agg(
since=polars.col("since").min(),
before=polars.col("before").max(),
insertions=polars.sum("insertions"),
deletions=polars.sum("deletions"),
active_weeks=polars.col("since").count()
).collect()
)
# Get the number of weeks each author has been active.
# Here we use a start and end date rather than a count of weeks
# with active contributions, since a contribution may be worked on
# over the course of several weeks. So long as employment is mostly
# continuous, this will provide meaningful metrics. If the evaluated
# time period includes at least a year's history, and employees
# have a similar number of vacation days/weeks, vacation gaps won't
# skew our stats too much either. Longer employment gaps, such as for
# sabbaticals and/or laid-off/re-hired authors, however, will
# meaningfully degrade that author's stats.
# TODO: Account for long employment gaps, if needed.
authors_data_frame = authors_data_frame.with_columns(
weeks=(
(polars.col("before") - polars.col("since")).dt.total_days() / 7
).cast(int)
).with_columns(
weekly_insertions=(
polars.col("insertions") / polars.col("weeks")
).cast(int),
weekly_deletions=(polars.col("deletions") / polars.col("weeks")).cast(
int
),
)
return authors_data_frame.sort("author_name", descending=True)
def get_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with insertions/deletions per week, by author.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since", "author_name")
.sum()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("author_name").alias("Author"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
).collect()
def get_sum_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
"""
Get a data frame with total insertions/deletions per week, for all authors.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since")
.sum()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
).collect()
def get_author_label(
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
) -> str:
"""
Get the label to use based on included/excluded author names
"""
if isinstance(include_author_names, str):
include_author_names = (include_author_names,)
if isinstance(exclude_author_names, str):
exclude_author_names = (exclude_author_names,)
label: list[str] = []
author_names: tuple[str, ...] = ()
if exclude_author_names:
if include_author_names:
exclude_author_names_set: frozenset[str] = frozenset(
exclude_author_names
)
author_names = tuple(
author_name
for author_name in include_author_names
if author_name not in exclude_author_names_set
)
else:
label.append("All Active Authors Except")
author_names = exclude_author_names
else:
if include_author_names:
author_names = include_author_names
else:
label.append("All Active Authors")
if author_names:
if len(author_names) == 1:
label += (author_names[0],)
else:
label += "{} and {}".format(
", ".join(author_names[:-1]),
author_names[-1],
)
return " ".join(label)
def plot_weekly_sum(
number_of_weeks: int = 32,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
author_label = get_author_label(include_author_names, exclude_author_names)
axes: matplotlib.axes.Axes = (
get_sum_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
.to_pandas()
.plot.area(
title=(
f"{author_label} - Weekly Total - "
f"Past {number_of_weeks} Weeks"
),
x="Week",
y=["Insertions", "Deletions"],
figsize=(32, 4),
rot=90,
xlabel="Week",
ylabel="Insertions + Deletions",
x_compat=True,
)
)
axes.xaxis.set_major_locator(LOCATOR)
axes.xaxis.set_ticks_position("none")
return axes
def get_mean_weekly_stats(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.DataFrame:
return (
scan_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
).join(
scan_authors(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
),
on="author_name"
)
.group_by("since")
.mean()
.sort("since")
.select(
(
polars.col("since").alias("Week"),
polars.col("insertions").alias("Insertions"),
polars.col("deletions").alias("Deletions"),
)
)
.collect()
)
def plot_weekly_mean(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
author_label = get_author_label(include_author_names, exclude_author_names)
axes: matplotlib.axes.Axes = (
get_mean_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
)
.to_pandas()
.plot.area(
title=(
f"{author_label} - Weekly per/Author Average - "
f"Past {number_of_weeks} Weeks"
),
x="Week",
y=["Insertions", "Deletions"],
figsize=(24, 6),
rot=90,
xlabel="Week",
ylabel="Insertions",
x_compat=True,
)
)
axes.xaxis.set_major_locator(LOCATOR)
axes.xaxis.set_ticks_position("none")
return axes
def scan_authors(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> polars.LazyFrame:
"""
Get a lazy frame with the names of all authors who have contributed
over at least the specified number of weeks.
"""
return (
scan_weekly_stats(
number_of_weeks, include_author_names, exclude_author_names
)
.group_by("author_name", "since")
.agg()
.group_by("author_name")
.agg(
weeks=polars.col("since").count()
)
.filter(
polars.col("weeks") >= author_minimum_weeks
)
        # We only need the author names, so we can drop the remaining columns
.select(
polars.col("author_name")
)
)
@lru_cache
def count_authors(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> int:
"""
Return the number of authors
"""
return scan_authors(
number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
).collect().height
def plot_author_weekly_insertions(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> Sequence[matplotlib.axes.Axes]:
"""
Plot weekly insertions for all authors over the past `number_of_weeks`
"""
author_label: str = get_author_label(
include_author_names,
exclude_author_names,
)
axes: matplotlib.axes.Axes
number_of_authors: int = count_authors(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
authors_axes: Sequence[matplotlib.axes.Axes] = (
get_weekly_stats(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks,
)
.to_pandas()
.pivot(index="Week", columns="Author", values="Insertions")
.plot.area(
title=(
f"Weekly Insertions - Past {number_of_weeks} Weeks - "
f"{author_label}"
),
figsize=(
24,
number_of_authors
),
rot=90,
x_compat=True,
subplots=True,
)
)
# Get the minimum and maximum y-bounds for all axes, in order to display
# all data at the same scale
ybounds: tuple[tuple[float, float], ...] = tuple(
map(matplotlib.axes.Axes.get_ybound, authors_axes)
)
ybound: tuple[float, float] = (
max(min(map(itemgetter(0), ybounds)), 0),
max(map(itemgetter(1), ybounds)),
)
# Set the y-bounds for all axes to the same values
for axes in authors_axes:
axes.set_ybound(*ybound)
axes.xaxis.set_ticks_position("none")
axes.autoscale(enable=True, axis="y", tight=True)
authors_axes[-1].xaxis.set_major_locator(LOCATOR)
return authors_axes
def plot_author_means(
number_of_weeks: int = HISTORY_WEEKS,
include_author_names: tuple[str, ...] | str = (),
exclude_author_names: tuple[str, ...] | str = (),
author_minimum_weeks: int = STALE_WEEKS,
) -> matplotlib.axes.Axes:
"""
Plot the weekly average insertions and deletions for all authors over
    the past `number_of_weeks`.
"""
author_label: str = get_author_label(
include_author_names,
exclude_author_names,
)
number_of_authors: int = count_authors(
number_of_weeks,
include_author_names,
exclude_author_names,
author_minimum_weeks=author_minimum_weeks
)
return get_author_stats(
number_of_weeks=number_of_weeks,
include_author_names=include_author_names,
exclude_author_names=exclude_author_names,
author_minimum_weeks=author_minimum_weeks,
).select(
polars.col("author_name").alias("Author"),
polars.col("weekly_insertions").alias("Weekly Insertions"),
polars.col("weekly_deletions").alias("Weekly Deletions"),
).to_pandas().plot.barh(
x="Author",
y=["Weekly Insertions", "Weekly Deletions"],
stacked=True,
title=(
f"Past {number_of_weeks} Weeks: "
f"Weekly Insertions and Deletions - {author_label}"
),
figsize=(24, number_of_authors / 4),
)
plot_author_means()
plot_weekly_mean()
plot_weekly_sum()
plot_author_weekly_insertions()
matplotlib.pyplot.show()
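
All of the plotting functions above accept the same filtering parameters, so the template can also chart a subset of contributors. A minimal sketch (the author name is hypothetical):

plot_weekly_mean(
    number_of_weeks=24,
    include_author_names="Jane Doe",
)
matplotlib.pyplot.show()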