Analyse & plot¶

Summarise a literature over time, compare topics within it, and turn the summaries into figures.

scopus_trend ¶

scopus_trend(query, years, view='STANDARD', **kwargs)

Count Scopus hits for `query` in each of `years` without downloading them.

Each year is a cheap result-size lookup, so this gives a publication trend far faster than harvesting every record.

Source code in src/scopusflow/trend.py

def scopus_trend(
    query: str,
    years: Sequence[int],
    view: str = "STANDARD",
    **kwargs,
) -> pd.DataFrame:
    """Count Scopus hits for ``query`` in each of ``years`` without downloading them.

    Each year is a cheap result-size lookup, so this gives a publication trend
    far faster than harvesting every record.

    Args:
        query: Scopus search string; must not be empty or whitespace only.
        years: Publication years to count. Each entry must be a whole number;
            a year given more than once is looked up only once.
        view: Forwarded to ``ScopusSearch`` as ``view``.
        **kwargs: Extra keyword arguments forwarded to ``ScopusSearch``.

    Returns:
        The frame ``_trend_frame`` builds from the ``{year: count}`` mapping.

    Raises:
        ValueError: If ``query`` is empty, ``years`` is empty, or a year is not
            a whole number.
    """
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string.")
    years = list(years)
    if not years:
        raise ValueError("years must be a non-empty sequence.")

    # Normalise every year to a plain int before any request is made, so the
    # query always reads "PUBYEAR IS 2020" (never "2020.0") and a fractional
    # year fails loudly instead of being keyed under its truncated value.
    # A dict keeps first-seen order while dropping repeats, so a duplicated
    # year does not spend a second count request.
    wanted: dict[int, None] = {}
    for y in years:
        whole = int(y)
        if float(y) != whole:
            raise ValueError("years must be whole numbers.")
        wanted[whole] = None

    from pybliometrics.scopus import ScopusSearch  # imported lazily; needs a key

    counts: dict[int, int] = {}
    for year in wanted:
        # download=False asks only for the result size, not the records.
        search = ScopusSearch(
            f"{query} AND PUBYEAR IS {year}", view=view, download=False, **kwargs
        )
        counts[year] = int(search.get_results_size())
    return _trend_frame(counts)

year_counts ¶

year_counts(records)

Count records per publication year, dropping rows with a missing year.

Returns a `TREND_COLUMNS` frame sorted ascending by year, with both columns as plain integers.

Source code in src/scopusflow/trend.py

def year_counts(records: pd.DataFrame) -> pd.DataFrame:
    """Tally how many records fall in each publication year.

    Values in the ``year`` column that cannot be read as numbers are treated
    as missing, and rows with a missing year are left out of the tally. The
    resulting ``{year: count}`` mapping (plain ``int`` keys and values) is
    handed to ``_trend_frame`` to build the returned frame.
    """
    numeric = pd.to_numeric(records["year"], errors="coerce")
    present = numeric[numeric.notna()]
    tally = present.astype(int).value_counts()
    counts = {int(year): int(hits) for year, hits in tally.to_dict().items()}
    return _trend_frame(counts)

compare_topics ¶

compare_topics(reference_query, comparison_terms, years, field=None, view='STANDARD', **kwargs)

Compare comparison topics against a reference topic over the years.

Returns a `COMPARISON_COLUMNS` frame. One count request is made per term per year, so keep the number of terms and years modest to stay within quota.

Source code in src/scopusflow/compare.py

def compare_topics(reference_query: str, comparison_terms, years: Sequence[int],
                   field: Optional[str] = None, view: str = "STANDARD",
                   **kwargs) -> pd.DataFrame:
    """Compare comparison topics against a reference topic over the years.

    Returns a :data:`COMPARISON_COLUMNS` frame. One count request per term per
    year, so keep the term and year counts modest to stay within quota.

    Args:
        reference_query: The reference topic; must not be empty.
        comparison_terms: One term (a string) or an iterable of terms. Each is
            stringified and stripped, and none may be empty.
        years: Years to count. Each must be a whole number between 1700 and
            2200; duplicates are dropped and the years are processed in
            ascending order. Any iterable is accepted, including a one-shot
            iterator.
        field: Passed to ``wrap_field`` together with each query/term.
        view: Forwarded to ``ScopusSearch`` as ``view``.
        **kwargs: Extra keyword arguments forwarded to ``ScopusSearch``.

    Raises:
        ValueError: If the reference query, the terms or the years are empty
            or invalid.
    """
    if not reference_query or not str(reference_query).strip():
        raise ValueError("reference_query must be a non-empty string.")
    reference = str(reference_query).strip()
    if isinstance(comparison_terms, str):
        comparison_terms = [comparison_terms]
    terms = [str(t).strip() for t in comparison_terms]
    if not terms or any(not t for t in terms):
        raise ValueError("comparison_terms must be a non-empty list of non-empty terms.")

    # Materialise ``years`` exactly once. Testing emptiness with a throwaway
    # ``list(years)`` would exhaust a generator/iterator, leaving nothing for
    # the validation loop and silently producing an empty comparison.
    raw_years = [] if years is None else list(years)
    if not raw_years:
        raise ValueError("years must be a non-empty sequence.")
    distinct = set()
    for y in raw_years:
        try:
            yi = int(y)
            is_whole = float(y) == yi
        except (TypeError, ValueError, OverflowError) as err:
            # None, non-numeric text, NaN and infinities all land here; report
            # them with the same message as any other invalid year.
            raise ValueError("years must be whole numbers between 1700 and 2200.") from err
        if not is_whole or not (1700 <= yi <= 2200):
            raise ValueError("years must be whole numbers between 1700 and 2200.")
        distinct.add(yi)
    ys = sorted(distinct)

    from pybliometrics.scopus import ScopusSearch  # imported lazily; needs a key

    def size(query: str, year: int) -> int:
        # download=False requests only the hit count for this query and year.
        full = f"{query} AND PUBYEAR IS {year}"
        return int(ScopusSearch(full, view=view, download=False, **kwargs).get_results_size())

    ref_query = wrap_field(reference, field)
    # One count step per term, plus the reference; logged as "Cell k/N:" so the
    # app's progress parser can drive a bar (mirrors the R verbose output).
    total = len(terms) + 1
    logger.info("Cell 1/%d: counting reference across %d year(s)", total, len(ys))
    ref_counts = {y: size(ref_query, y) for y in ys}

    comparison = []
    for i, term in enumerate(terms):
        logger.info("Cell %d/%d: counting '%s'", i + 2, total, term)
        # Each comparison query is the reference narrowed by the term, so its
        # hits are a subset of the reference hits.
        cmp_query = f"{ref_query} AND {wrap_field(term, field)}"
        comparison.append((term, cmp_query, {y: size(cmp_query, y) for y in ys}))

    return _assemble(reference, ref_query, ref_counts, comparison, ys)

COMPARISON_COLUMNS `module-attribute` ¶

# Column names, in order, of a topic-comparison frame.
COMPARISON_COLUMNS = [
    "query",
    "query_type",
    "abridged_query",
    "year",
    "n",
    "reference_n",
    "comparison_percentage",
    "average_comparison_percentage",
]

plot_trend ¶

plot_trend(trend, ax=None)

Plot publication counts over time as a filled area, line and points.

`trend` has columns `["year", "n"]`; returns the matplotlib `Axes`.

Source code in src/scopusflow/plots.py

def plot_trend(trend: pd.DataFrame, ax=None):
    """Draw yearly publication counts as a shaded area with a line and markers.

    ``trend`` supplies the ``"year"`` and ``"n"`` columns. When ``ax`` is not
    given a new figure and axes are created. The matplotlib ``Axes`` that was
    drawn on is returned.
    """
    import matplotlib.pyplot as plt

    axes = ax if ax is not None else plt.subplots()[1]

    x, y = trend["year"], trend["n"]
    colour = _TREND_COLOUR

    # Area first, then the line, then the points on top (zorder=3).
    axes.fill_between(x, y, alpha=0.16, color=colour)
    axes.plot(x, y, color=colour, linewidth=2)
    axes.scatter(x, y, color=colour, s=18, zorder=3)

    # Counts cannot be negative, so anchor the y-axis at zero.
    axes.set_ylim(bottom=0)
    axes.set_xlabel("Year")
    axes.set_ylabel("Records")
    _clean_axes(axes)
    return axes

plot_top ¶

plot_top(top, ax=None)

Plot a horizontal bar chart of the most frequent values, largest on top.

`top` has columns `["value", "n"]` (from `scopusflow.records.top`); returns the matplotlib `Axes`.

Source code in src/scopusflow/plots.py

def plot_top(top: pd.DataFrame, ax=None):
    """Draw the most frequent values as horizontal bars, largest at the top.

    ``top`` supplies the ``"value"`` and ``"n"`` columns (as produced by
    :func:`scopusflow.records.top`). When ``ax`` is not given a new figure and
    axes are created. The matplotlib ``Axes`` that was drawn on is returned.
    """
    import matplotlib.pyplot as plt

    axes = ax if ax is not None else plt.subplots()[1]

    # Feed the rows last-to-first so that the first row of ``top`` ends up as
    # the uppermost bar.
    flipped = top.iloc[::-1]
    labels = flipped["value"].astype(str)
    widths = flipped["n"]
    axes.barh(labels, widths, color=_TOP_COLOUR)

    axes.set_xlabel("Records")
    axes.set_ylabel("")
    _clean_axes(axes)
    return axes

plot_comparison ¶

plot_comparison(comparison, highlight=None, interval=True, counts_in_legend=True, ax=None)

Plot each comparison topic's share of the reference literature over time.

`comparison` is the frame from `scopusflow.compare.compare_topics`. With `interval`, a shaded Wilson band shows how stable each yearly share is (illustrative, not a confidence interval — Scopus counts are exact). `highlight` names one topic to draw in an accent colour, with the rest in grey. With `counts_in_legend` (the default), each label carries the topic's total record count, for example `machine learning (n = 1,204)`. Returns the matplotlib `Axes`.

Source code in src/scopusflow/plots.py

def plot_comparison(comparison: pd.DataFrame, highlight=None, interval: bool = True,
                    counts_in_legend: bool = True, ax=None):
    """Plot each comparison topic's share of the reference literature over time.

    ``comparison`` is the frame from :func:`scopusflow.compare.compare_topics`.
    With ``interval`` a shaded Wilson band shows how stable each yearly share is
    (illustrative, not a confidence interval — Scopus counts are exact).
    ``highlight`` names one topic to draw in an accent colour, the rest in grey.
    With ``counts_in_legend`` (the default) each label carries the topic's total
    record count, for example ``machine learning (n = 1,204)``. Returns the
    matplotlib ``Axes``.

    Args:
        comparison: Frame with at least the columns ``query_type``,
            ``abridged_query``, ``year`` and ``comparison_percentage``. The
            ``average_comparison_percentage`` column is also read (to order
            the topics); ``n`` and ``reference_n`` are used when present.
        highlight: ``abridged_query`` value of the topic to emphasise, or
            ``None`` to colour every topic from the viridis colormap.
        interval: Draw the shaded band; only takes effect when both ``n`` and
            ``reference_n`` columns are present. With ``highlight`` set, only
            the highlighted topic gets a band.
        counts_in_legend: Append ``(n = ...)`` to each label; only takes
            effect when an ``n`` column is present.
        ax: Axes to draw on; a new figure and axes are created when ``None``.

    Raises:
        ValueError: If a required column is missing, if no comparison row has
            a non-missing share, or if ``highlight`` is not one of the topics.
    """
    import matplotlib.pyplot as plt
    import matplotlib.ticker as mticker

    required = {"query_type", "abridged_query", "year", "comparison_percentage"}
    if not required.issubset(comparison.columns):
        raise ValueError("comparison must be a topic-comparison frame.")

    comp_all = comparison[comparison["query_type"] == "comparison"]
    # A year whose reference has no records carries no defined share; it is
    # dropped and noted in the caption, mirroring the R plot.
    n_missing = int(comp_all["comparison_percentage"].isna().sum())
    df = comp_all[comp_all["comparison_percentage"].notna()].copy()
    if df.empty:
        raise ValueError("comparison has no comparison topics with a finite share to plot.")

    # The reference topic names the subtitle (it is the 100% denominator).
    # The subtitle is only drawn when exactly one distinct reference name exists.
    ref_rows = comparison[comparison["query_type"] == "reference"]
    ref_names = ref_rows["abridged_query"].dropna().unique() if len(ref_rows) else []
    ref_label = str(ref_names[0]) if len(ref_names) == 1 else None

    # Topics are drawn in order of average share (largest first), with the
    # topic name as an ascending tie-breaker.
    order = (df.groupby("abridged_query")["average_comparison_percentage"].first()
             .reset_index()
             .sort_values(["average_comparison_percentage", "abridged_query"],
                          ascending=[False, True]))
    topics = list(order["abridged_query"])
    if highlight is not None and highlight not in topics:
        raise ValueError(f"highlight must be one of: {', '.join(topics)}.")

    # Optionally append each topic's total record count to its label.
    # The total sums ``n`` over the plotted rows only (rows with a missing
    # share were already removed from ``df``).
    has_counts = counts_in_legend and "n" in df.columns
    totals = df.groupby("abridged_query")["n"].sum() if has_counts else None

    def _label(topic):
        return f"{topic} (n = {int(totals[topic]):,})" if has_counts else topic

    if ax is None:
        _, ax = plt.subplots()
    cmap = plt.get_cmap("viridis")
    # Denominator for spacing colours along the colormap; at least 1 so a
    # single topic does not divide by zero.
    spread = max(len(topics) - 1, 1)
    has_band = interval and {"n", "reference_n"}.issubset(df.columns)
    # One (x, y, label, colour, is_highlighted) tuple per topic, taken at the
    # line's last year, used later for direct labelling.
    label_points = []

    for i, topic in enumerate(topics):
        sub = df[df["abridged_query"] == topic].sort_values("year")
        is_hi = highlight == topic
        if highlight is not None:
            colour = "#BB5566" if is_hi else "#BFBFBF"
            width = 1.6 if is_hi else 0.8
        else:
            colour = cmap(0.05 + 0.8 * i / spread)
            width = 1.4
        if has_band and (highlight is None or is_hi):
            # NOTE(review): _wilson is defined elsewhere; it is used here as
            # returning (lower, upper) arrays, presumably in percent to match
            # the y-axis — confirm against its definition.
            lo, up = _wilson(sub["n"].to_numpy(), sub["reference_n"].to_numpy())
            ax.fill_between(sub["year"], lo, up, color=colour, alpha=0.16, linewidth=0)
        ax.plot(sub["year"], sub["comparison_percentage"], color=colour,
                linewidth=width, label=_label(topic))
        ax.scatter(sub["year"], sub["comparison_percentage"], color=colour, s=14, zorder=3)
        last = sub.iloc[-1]
        label_points.append((float(last["year"]), float(last["comparison_percentage"]),
                             _label(topic), colour, is_hi))

    # Cap the y-axis at the next 5% above the data (and bands), as the R plot
    # does, to remove dead headroom.
    # NOTE(review): ``math`` is not imported inside this function, so it must
    # be available at module scope. If every share is 0 and no band is drawn,
    # ``ymax`` evaluates to 0 and the limits below become (0, 0) — confirm
    # whether that case needs a floor.
    if has_band:
        _, band_upper = _wilson(df["n"].to_numpy(), df["reference_n"].to_numpy())
        top = max(float(df["comparison_percentage"].max()), float(band_upper.max()))
    else:
        top = float(df["comparison_percentage"].max())
    ymax = min(100, math.ceil(top / 5) * 5)
    ax.set_ylim(0, ymax)

    # Label the lines directly when they fit legibly; otherwise fall back to a
    # legend. The gap is the minimum vertical separation between labels.
    # With a highlight, only the highlighted line is labelled and no legend is
    # drawn for the grey lines.
    gap = ymax * 0.055
    if highlight is not None:
        to_label = [p for p in label_points if p[4]]
    elif len(topics) <= 8 and (len(topics) - 1) * gap <= ymax:
        to_label = label_points
    else:
        to_label = []
        ax.legend(fontsize=8, loc="best", frameon=False,
                  ncol=2 if len(topics) > 8 else 1)

    # Place each label at its line's endpoint, with a thin leader. The labels are
    # spread apart at the end (see _decollide_labels), once the final layout is
    # known, so none overlaps another.
    anns, label_xs, label_y_true = [], [], []
    if to_label:
        years = df["year"]
        # Horizontal offset of the label from the endpoint: 1.5% of the year
        # span plus a small constant so it is non-zero for a single year.
        dx = (float(years.max()) - float(years.min())) * 0.015 + 0.1
        for x, y_true, topic, colour, _is_hi in to_label:
            # annotation_clip=False keeps the label visible although it sits
            # to the right of the last data point.
            anns.append(ax.annotate(
                topic, xy=(x, y_true), xytext=(x + dx, y_true), textcoords="data",
                va="center", ha="left", fontsize=8, color=colour,
                annotation_clip=False,
                arrowprops=dict(arrowstyle="-", color=colour, lw=0.6, shrinkA=1, shrinkB=3),
            ))
            label_xs.append(x + dx)
            label_y_true.append(y_true)
    ax.set_xlabel("")
    ax.set_ylabel("Share of reference records")
    ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=100, decimals=0))
    ax.set_title("Topic share within a reference literature, over time",
                 loc="left", fontsize=12, pad=22)
    if ref_label:
        # Subtitle, positioned just above the top-left corner of the axes.
        ax.annotate(
            f"Each line: % of '{ref_label}' records that also match the topic",
            xy=(0, 1), xycoords="axes fraction", xytext=(0, 6),
            textcoords="offset points", ha="left", va="bottom",
            fontsize=9, color="#555555",
        )
    # A caption that names the source and guards against reading the illustrative
    # Wilson band as an inferential confidence interval (it is not).
    caption = (f"Source: 'Scopus' Search API. Years {int(df['year'].min())} "
               f"to {int(df['year'].max())}.")
    if has_band:
        caption += ("\nShaded band: illustrative Wilson stability range "
                    "(not a confidence interval), wider where the reference set is small.")
    if n_missing > 0:
        plural = "" if n_missing == 1 else "s"
        caption += (f"\n{n_missing} year-topic value{plural} omitted for want "
                    "of reference records.")
    # The caption hangs below the bottom-left corner of the axes.
    ax.annotate(caption, xy=(0, 0), xycoords="axes fraction", xytext=(0, -24),
                textcoords="offset points", ha="left", va="top",
                fontsize=7.5, color="#737373")
    _clean_axes(ax)

    if anns:
        # Re-spread the labels on every draw, measuring the rendered text height in
        # the layout that actually applies, so they never overlap however the lines
        # converge or the figure is sized. Running on the draw means it stays
        # correct after the caller tightens the layout. A guard stops the redraw it
        # requests from recursing once the positions have settled.
        _busy = {"on": False}

        def _on_draw(_event=None):
            if _busy["on"]:
                return
            _busy["on"] = True
            try:
                # NOTE(review): _decollide_once is defined elsewhere; its
                # truthy return is treated here as "a redraw is needed",
                # presumably because labels were moved — confirm.
                if _decollide_once(ax, anns, label_xs, label_y_true, ymax, gap):
                    ax.figure.canvas.draw_idle()
            finally:
                _busy["on"] = False

        # NOTE(review): a new callback is connected on every call, including
        # repeated calls on the same Axes, and the connection id is not kept.
        ax.figure.canvas.mpl_connect("draw_event", _on_draw)
        # An initial draw so a one-shot render (savefig without a prior draw) still
        # gets de-collided labels.
        # Best-effort: any failure of this eager draw is deliberately ignored,
        # and the already-connected callback still runs on a later draw.
        try:
            ax.figure.canvas.draw()
        except Exception:
            pass
    return ax

Analyse & plot¶

scopus_trend ¶

year_counts ¶

compare_topics ¶

COMPARISON_COLUMNS module-attribute ¶

plot_trend ¶

plot_top ¶

plot_comparison ¶

COMPARISON_COLUMNS `module-attribute` ¶