Skip to content

Analyse & plot

Summarise a literature over time, compare topics within it, and turn the summaries into figures.

scopus_trend

scopus_trend(query, years, view='STANDARD', **kwargs)

Count Scopus hits for query in each of years without downloading them.

Each year is a cheap result-size lookup, so this gives a publication trend far faster than harvesting every record.

Source code in src/scopusflow/trend.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def scopus_trend(
    query: str,
    years: Sequence[int],
    view: str = "STANDARD",
    **kwargs,
) -> pd.DataFrame:
    """Count Scopus hits for ``query`` in each of ``years`` without downloading them.

    Each year is a cheap result-size lookup, so this gives a publication trend
    far faster than harvesting every record.

    Args:
        query: Scopus search string; must contain non-whitespace text.
        years: Publication years to count. Every value must be a whole number
            (``2020`` or ``2020.0``); a repeated year is only requested once.
        view: Passed to ``ScopusSearch`` as its ``view`` argument.
        **kwargs: Forwarded unchanged to ``ScopusSearch``.

    Returns:
        The frame ``_trend_frame`` builds from the ``{year: hits}`` mapping.

    Raises:
        ValueError: If ``query`` is blank, ``years`` is empty, or any year is
            not a whole number.
    """
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string.")
    years = list(years)
    if not years:
        raise ValueError("years must be a non-empty sequence.")

    # Normalise years up front so the query text and the result key always agree
    # (a float such as 2020.0 must be sent as "PUBYEAR IS 2020", and 2020.5 must
    # not be silently counted under 2020).
    whole_years: list[int] = []
    for raw in years:
        try:
            year = int(raw)
            is_whole = float(raw) == year
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("years must be whole numbers.") from err
        if not is_whole:
            raise ValueError("years must be whole numbers.")
        whole_years.append(year)
    # Drop repeats (keeping first-seen order) so no year costs two requests.
    whole_years = list(dict.fromkeys(whole_years))

    from pybliometrics.scopus import ScopusSearch  # imported lazily; needs a key

    counts: dict[int, int] = {}
    for year in whole_years:
        search = ScopusSearch(
            f"{query} AND PUBYEAR IS {year}", view=view, download=False, **kwargs
        )
        counts[year] = int(search.get_results_size())
    return _trend_frame(counts)

year_counts

year_counts(records)

Count records per publication year, dropping rows with a missing year.

Returns a `TREND_COLUMNS` frame sorted ascending by year, with both columns as plain integers.

Source code in src/scopusflow/trend.py
19
20
21
22
23
24
25
26
27
def year_counts(records: pd.DataFrame) -> pd.DataFrame:
    """Tally how many records fall in each publication year.

    The ``year`` column is coerced to numbers; anything that cannot be parsed
    becomes missing and is dropped before counting. The per-year tallies are
    handed to ``_trend_frame`` as a plain ``{int: int}`` mapping.
    """
    numeric_years = pd.to_numeric(records["year"], errors="coerce")
    present = numeric_years.dropna().astype(int)

    per_year = {}
    for year, hits in present.value_counts().items():
        # Cast away numpy scalar types so keys and values are plain ints.
        per_year[int(year)] = int(hits)
    return _trend_frame(per_year)

compare_topics

compare_topics(reference_query, comparison_terms, years, field=None, view='STANDARD', **kwargs)

Compare comparison topics against a reference topic over the years.

Returns a `COMPARISON_COLUMNS` frame. The function makes one count request per term per year (plus one per year for the reference), so keep the number of terms and years modest to stay within quota.

Source code in src/scopusflow/compare.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def compare_topics(reference_query: str, comparison_terms, years: Sequence[int],
                   field: Optional[str] = None, view: str = "STANDARD",
                   **kwargs) -> pd.DataFrame:
    """Compare comparison topics against a reference topic over the years.

    Returns a :data:`COMPARISON_COLUMNS` frame. One count request per term per
    year, so keep the term and year counts modest to stay within quota.

    Args:
        reference_query: The reference topic; must contain non-whitespace text.
        comparison_terms: One term (a string) or an iterable of terms. Each is
            stringified and stripped, and must be non-empty afterwards.
        years: Whole-number years between 1700 and 2200. Any iterable is
            accepted (including a one-shot iterator); duplicates are dropped
            and the years are processed in ascending order.
        field: Passed to ``wrap_field`` together with each query/term.
        view: Passed to ``ScopusSearch`` as its ``view`` argument.
        **kwargs: Forwarded unchanged to ``ScopusSearch``.

    Raises:
        ValueError: If the reference query is blank, the terms are missing or
            blank, or the years are empty, non-numeric, fractional or out of
            range.
    """
    if not reference_query or not str(reference_query).strip():
        raise ValueError("reference_query must be a non-empty string.")
    if comparison_terms is None:
        comparison_terms = []
    if isinstance(comparison_terms, str):
        comparison_terms = [comparison_terms]
    terms = [str(t).strip() for t in comparison_terms]
    if not terms or any(not t for t in terms):
        raise ValueError("comparison_terms must be a non-empty list of non-empty terms.")

    # Materialise once: checking emptiness with list(years) and then looping over
    # `years` again would silently see nothing if the caller passed an iterator.
    year_values = [] if years is None else list(years)
    if not year_values:
        raise ValueError("years must be a non-empty sequence.")
    ys = []
    for y in year_values:
        try:
            yi = int(y)
            is_whole = float(y) == yi
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("years must be whole numbers between 1700 and 2200.") from err
        if not is_whole or not (1700 <= yi <= 2200):
            raise ValueError("years must be whole numbers between 1700 and 2200.")
        ys.append(yi)
    ys = sorted(set(ys))

    from pybliometrics.scopus import ScopusSearch  # imported lazily; needs a key

    def size(query: str, year: int) -> int:
        # download=False asks only for the result size, not the records.
        full = f"{query} AND PUBYEAR IS {year}"
        return int(ScopusSearch(full, view=view, download=False, **kwargs).get_results_size())

    ref_query = wrap_field(str(reference_query).strip(), field)
    # One count step per term, plus the reference; logged as "Cell k/N:" so the
    # app's progress parser can drive a bar (mirrors the R verbose output).
    total = len(terms) + 1
    logger.info("Cell 1/%d: counting reference across %d year(s)", total, len(ys))
    ref_counts = {y: size(ref_query, y) for y in ys}

    comparison = []
    for i, term in enumerate(terms):
        logger.info("Cell %d/%d: counting '%s'", i + 2, total, term)
        # Each comparison is counted *within* the reference set (ref AND term).
        cmp_query = f"{ref_query} AND {wrap_field(term, field)}"
        comparison.append((term, cmp_query, {y: size(cmp_query, y) for y in ys}))

    return _assemble(str(reference_query).strip(), ref_query, ref_counts, comparison, ys)

COMPARISON_COLUMNS module-attribute

COMPARISON_COLUMNS = ['query', 'query_type', 'abridged_query', 'year', 'n', 'reference_n', 'comparison_percentage', 'average_comparison_percentage']

plot_trend

plot_trend(trend, ax=None)

Plot publication counts over time as a filled area, line and points.

trend has columns ["year", "n"]; returns the matplotlib Axes.

Source code in src/scopusflow/plots.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def plot_trend(trend: pd.DataFrame, ax=None):
    """Draw yearly publication counts as a shaded area with a line and markers.

    ``trend`` supplies the ``"year"`` (x) and ``"n"`` (y) columns. When ``ax``
    is omitted a new figure and axes are created. The matplotlib ``Axes`` that
    was drawn on is returned.
    """
    import matplotlib.pyplot as plt

    if ax is None:
        ax = plt.subplots()[1]

    xs, ys = trend["year"], trend["n"]

    # Three layers in one colour: soft fill, the line itself, then the points
    # (raised via zorder so they sit above the line).
    ax.fill_between(xs, ys, color=_TREND_COLOUR, alpha=0.16)
    ax.plot(xs, ys, linewidth=2, color=_TREND_COLOUR)
    ax.scatter(xs, ys, s=18, zorder=3, color=_TREND_COLOUR)

    # Anchor the count axis at zero after the data is in place.
    ax.set_ylim(bottom=0)
    ax.set_ylabel("Records")
    ax.set_xlabel("Year")
    _clean_axes(ax)
    return ax

plot_top

plot_top(top, ax=None)

Plot a horizontal bar chart of the most frequent values, largest on top.

`top` has columns ["value", "n"] (as returned by `scopusflow.records.top`); the function returns the matplotlib Axes.

Source code in src/scopusflow/plots.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def plot_top(top: pd.DataFrame, ax=None):
    """Draw the most frequent values as horizontal bars.

    ``top`` supplies the ``"value"`` (bar label) and ``"n"`` (bar length)
    columns, as produced by :func:`scopusflow.records.top`. When ``ax`` is
    omitted a new figure and axes are created. The matplotlib ``Axes`` that was
    drawn on is returned.
    """
    import matplotlib.pyplot as plt

    if ax is None:
        ax = plt.subplots()[1]

    # Feed the rows in reverse order so the frame's first row (the largest
    # count, assuming ``top`` arrives sorted descending) ends up at the top.
    flipped = top.iloc[::-1]
    bar_labels = flipped["value"].astype(str)
    ax.barh(bar_labels, flipped["n"], color=_TOP_COLOUR)

    ax.set_ylabel("")
    ax.set_xlabel("Records")
    _clean_axes(ax)
    return ax

plot_comparison

plot_comparison(comparison, highlight=None, interval=True, counts_in_legend=True, ax=None)

Plot each comparison topic's share of the reference literature over time.

`comparison` is the frame returned by `scopusflow.compare.compare_topics`. With `interval`, a shaded Wilson band shows how stable each yearly share is (illustrative, not a confidence interval — Scopus counts are exact). `highlight` names one topic to draw in an accent colour, with the rest in grey. With `counts_in_legend` (the default), each label carries the topic's total record count, for example "machine learning (n = 1,204)". Returns the matplotlib Axes.

Source code in src/scopusflow/plots.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def plot_comparison(comparison: pd.DataFrame, highlight=None, interval: bool = True,
                    counts_in_legend: bool = True, ax=None):
    """Plot each comparison topic's share of the reference literature over time.

    ``comparison`` is the frame from :func:`scopusflow.compare.compare_topics`.
    With ``interval`` a shaded Wilson band shows how stable each yearly share is
    (illustrative, not a confidence interval — Scopus counts are exact).
    ``highlight`` names one topic to draw in an accent colour, the rest in grey.
    With ``counts_in_legend`` (the default) each label carries the topic's total
    record count, for example ``machine learning (n = 1,204)``. Returns the
    matplotlib ``Axes``.

    Args:
        comparison: Frame with at least the columns ``query_type``,
            ``abridged_query``, ``year``, ``comparison_percentage`` and
            ``average_comparison_percentage``. The ``n`` and ``reference_n``
            columns are optional; both are needed for the band, and ``n`` for
            the counts in the labels.
        highlight: ``abridged_query`` value of the one topic to accent, or
            ``None`` to colour every topic from the viridis map.
        interval: Draw the Wilson band when ``n`` and ``reference_n`` exist.
        counts_in_legend: Append each topic's summed ``n`` to its label.
        ax: Axes to draw on; a new figure and axes are created when ``None``.

    Raises:
        ValueError: If a required column is missing, no comparison row has a
            non-missing share, or ``highlight`` is not one of the topics.
    """
    # Validate before importing matplotlib so a malformed frame fails fast.
    # "average_comparison_percentage" is required too: it drives the topic
    # ordering below, and without this check its absence would surface as a
    # bare KeyError.
    required = {"query_type", "abridged_query", "year", "comparison_percentage",
                "average_comparison_percentage"}
    if not required.issubset(comparison.columns):
        raise ValueError("comparison must be a topic-comparison frame.")

    comp_all = comparison[comparison["query_type"] == "comparison"]
    # A year whose reference has no records carries no defined share; it is
    # dropped and noted in the caption, mirroring the R plot.
    n_missing = int(comp_all["comparison_percentage"].isna().sum())
    df = comp_all[comp_all["comparison_percentage"].notna()].copy()
    if df.empty:
        raise ValueError("comparison has no comparison topics with a finite share to plot.")

    # The reference topic names the subtitle (it is the 100% denominator).
    ref_rows = comparison[comparison["query_type"] == "reference"]
    ref_names = ref_rows["abridged_query"].dropna().unique() if len(ref_rows) else []
    ref_label = str(ref_names[0]) if len(ref_names) == 1 else None

    # Topics ordered by average share (largest first), ties broken by name.
    order = (df.groupby("abridged_query")["average_comparison_percentage"].first()
             .reset_index()
             .sort_values(["average_comparison_percentage", "abridged_query"],
                          ascending=[False, True]))
    topics = list(order["abridged_query"])
    if highlight is not None and highlight not in topics:
        raise ValueError(f"highlight must be one of: {', '.join(topics)}.")

    # Optionally append each topic's total record count to its label.
    has_counts = counts_in_legend and "n" in df.columns
    totals = df.groupby("abridged_query")["n"].sum() if has_counts else None

    def _label(topic):
        return f"{topic} (n = {int(totals[topic]):,})" if has_counts else topic

    import matplotlib.pyplot as plt
    import matplotlib.ticker as mticker

    if ax is None:
        _, ax = plt.subplots()
    cmap = plt.get_cmap("viridis")
    spread = max(len(topics) - 1, 1)  # avoids dividing by zero for one topic
    has_band = interval and {"n", "reference_n"}.issubset(df.columns)
    label_points = []

    for i, topic in enumerate(topics):
        sub = df[df["abridged_query"] == topic].sort_values("year")
        is_hi = highlight == topic
        text = _label(topic)  # shared by the legend entry and the end-of-line label
        if highlight is not None:
            colour = "#BB5566" if is_hi else "#BFBFBF"
            width = 1.6 if is_hi else 0.8
        else:
            colour = cmap(0.05 + 0.8 * i / spread)
            width = 1.4
        # With a highlight, only the highlighted topic gets a band.
        if has_band and (highlight is None or is_hi):
            lo, up = _wilson(sub["n"].to_numpy(), sub["reference_n"].to_numpy())
            ax.fill_between(sub["year"], lo, up, color=colour, alpha=0.16, linewidth=0)
        ax.plot(sub["year"], sub["comparison_percentage"], color=colour,
                linewidth=width, label=text)
        ax.scatter(sub["year"], sub["comparison_percentage"], color=colour, s=14, zorder=3)
        last = sub.iloc[-1]
        label_points.append((float(last["year"]), float(last["comparison_percentage"]),
                             text, colour, is_hi))

    # Cap the y-axis at the next 5% above the data (and bands), as the R plot
    # does, to remove dead headroom.
    if has_band:
        _, band_upper = _wilson(df["n"].to_numpy(), df["reference_n"].to_numpy())
        top = max(float(df["comparison_percentage"].max()), float(band_upper.max()))
    else:
        top = float(df["comparison_percentage"].max())
    ymax = min(100, math.ceil(top / 5) * 5)
    ax.set_ylim(0, ymax)

    # Label the lines directly when they fit legibly; otherwise fall back to a
    # legend. The gap is the minimum vertical separation between labels.
    gap = ymax * 0.055
    if highlight is not None:
        to_label = [p for p in label_points if p[4]]
    elif len(topics) <= 8 and (len(topics) - 1) * gap <= ymax:
        to_label = label_points
    else:
        to_label = []
        ax.legend(fontsize=8, loc="best", frameon=False,
                  ncol=2 if len(topics) > 8 else 1)

    # Place each label at its line's endpoint, with a thin leader. The labels are
    # spread apart at the end (see _decollide_labels), once the final layout is
    # known, so none overlaps another.
    anns, label_xs, label_y_true = [], [], []
    if to_label:
        years = df["year"]
        # Horizontal offset: 1.5% of the year span plus a small constant, so a
        # single-year frame still gets a visible leader.
        dx = (float(years.max()) - float(years.min())) * 0.015 + 0.1
        for x, y_true, text, colour, _is_hi in to_label:
            anns.append(ax.annotate(
                text, xy=(x, y_true), xytext=(x + dx, y_true), textcoords="data",
                va="center", ha="left", fontsize=8, color=colour,
                annotation_clip=False,
                arrowprops=dict(arrowstyle="-", color=colour, lw=0.6, shrinkA=1, shrinkB=3),
            ))
            label_xs.append(x + dx)
            label_y_true.append(y_true)
    ax.set_xlabel("")
    ax.set_ylabel("Share of reference records")
    ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=100, decimals=0))
    ax.set_title("Topic share within a reference literature, over time",
                 loc="left", fontsize=12, pad=22)
    if ref_label:
        ax.annotate(
            f"Each line: % of '{ref_label}' records that also match the topic",
            xy=(0, 1), xycoords="axes fraction", xytext=(0, 6),
            textcoords="offset points", ha="left", va="bottom",
            fontsize=9, color="#555555",
        )
    # A caption that names the source and guards against reading the illustrative
    # Wilson band as an inferential confidence interval (it is not).
    caption = (f"Source: 'Scopus' Search API. Years {int(df['year'].min())} "
               f"to {int(df['year'].max())}.")
    if has_band:
        caption += ("\nShaded band: illustrative Wilson stability range "
                    "(not a confidence interval), wider where the reference set is small.")
    if n_missing > 0:
        plural = "" if n_missing == 1 else "s"
        caption += (f"\n{n_missing} year-topic value{plural} omitted for want "
                    "of reference records.")
    ax.annotate(caption, xy=(0, 0), xycoords="axes fraction", xytext=(0, -24),
                textcoords="offset points", ha="left", va="top",
                fontsize=7.5, color="#737373")
    _clean_axes(ax)

    if anns:
        # Re-spread the labels on every draw, measuring the rendered text height in
        # the layout that actually applies, so they never overlap however the lines
        # converge or the figure is sized. Running on the draw means it stays
        # correct after the caller tightens the layout. A guard stops the redraw it
        # requests from recursing once the positions have settled.
        _busy = {"on": False}

        def _on_draw(_event=None):
            if _busy["on"]:
                return
            _busy["on"] = True
            try:
                if _decollide_once(ax, anns, label_xs, label_y_true, ymax, gap):
                    ax.figure.canvas.draw_idle()
            finally:
                _busy["on"] = False

        ax.figure.canvas.mpl_connect("draw_event", _on_draw)
        # An initial draw so a one-shot render (savefig without a prior draw) still
        # gets de-collided labels.
        try:
            ax.figure.canvas.draw()
        except Exception:
            # Deliberately best-effort: if this eager draw fails, the draw_event
            # hook above still runs on the next successful draw.
            pass
    return ax