Skip to content

Records

Normalise results into one stable schema, tally the most frequent values, and export to reference-manager formats.

to_records

to_records(results, query=None)

Normalise a pybliometrics ScopusSearch().results list (named tuples) or a list of dicts into a tidy DataFrame whose columns are RECORD_COLUMNS.

Whatever the query type, the columns are the same, so the downstream DOI, diff and analysis helpers can rely on them.

Source code in src/scopusflow/records.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def to_records(results, query: str | None = None) -> pd.DataFrame:
    """Build a :data:`RECORD_COLUMNS` DataFrame from Scopus search results.

    ``results`` is a pybliometrics ``ScopusSearch().results`` list (named
    tuples) or a list of dicts; ``None`` or an empty list yields a frame with
    no rows. The column layout does not depend on the query type, which is
    what the downstream DOI, diff and analysis helpers rely on.

    ``query`` is copied unchanged into the ``query`` column of every row.
    """

    def _row(position, item):
        # One output row; ``position`` is the 1-based index within ``results``.
        eid = _get(item, "eid")
        if eid:
            # Keep only what follows the "2-s2.0-" prefix of the EID.
            scopus_id = str(eid).split("2-s2.0-")[-1]
        else:
            scopus_id = pd.NA

        cover_date = _get(item, "coverDate")

        cited_by = _get(item, "citedby_count")
        if cited_by in (None, ""):
            citations = pd.NA
        else:
            citations = int(cited_by)

        return {
            "entry_number": position,
            "scopus_id": scopus_id,
            "doi": _get(item, "doi"),
            "title": _get(item, "title"),
            # author_names holds several authors joined with ';' by
            # pybliometrics; fall back to the single creator when it is empty.
            "authors": _get(item, "author_names") or _get(item, "creator"),
            "year": _year(cover_date),
            "date": cover_date,
            "publication": _get(item, "publicationName"),
            "citations": citations,
            "query": query,
        }

    return pd.DataFrame(
        [_row(position, item) for position, item in enumerate(results or [], start=1)],
        columns=RECORD_COLUMNS,
    )

top

top(records, by='source', n=10)

Tally the most frequent sources or authors in a record set.

Source code in src/scopusflow/records.py
56
57
58
59
60
61
62
63
64
65
66
67
68
def top(records: pd.DataFrame, by: str = "source", n: int = 10) -> pd.DataFrame:
    """Count the most common sources or authors in ``records``.

    ``by="source"`` tallies the ``publication`` column; ``by="author"`` splits
    the ``authors`` column on ``;`` and tallies each individual name. Missing
    values are ignored in both modes.

    Returns a DataFrame with columns ``value`` and ``n`` holding at most ``n``
    rows, most frequent first.

    Raises ``ValueError`` when ``by`` is neither ``"source"`` nor ``"author"``.
    """
    if by not in ("source", "author"):
        raise ValueError("by must be 'source' or 'author'.")

    if by == "source":
        pool = records["publication"].dropna()
    else:
        # One row per author name, with surrounding whitespace removed.
        names = records["authors"].dropna().str.split(";").explode()
        names = names.str.strip()
        # A trailing ';' leaves an empty fragment behind; discard those.
        pool = names[names != ""]

    tally = pool.value_counts().head(n)
    tally = tally.rename_axis("value")
    return tally.reset_index(name="n")

RECORD_COLUMNS module-attribute

# Column names, in order, of the normalised records DataFrame.
RECORD_COLUMNS = [
    "entry_number",
    "scopus_id",
    "doi",
    "title",
    "authors",
    "year",
    "date",
    "publication",
    "citations",
    "query",
]

to_bibtex

to_bibtex(records)

Render records as a BibTeX string, one @article entry per row, with citation keys made unique within the export.

Source code in src/scopusflow/export.py
122
123
124
125
126
127
128
129
130
131
def to_bibtex(records: pd.DataFrame) -> str:
    """Return ``records`` as BibTeX text.

    Each row becomes one entry, and entries are separated by a blank line. A
    candidate citation key is derived per row from its ``authors``, ``year``
    and ``scopus_id`` values, then the full key list is passed through
    ``_disambiguate`` so keys do not clash within the export.

    Raises ``ValueError`` when ``records`` is not a pandas DataFrame.
    """
    if not isinstance(records, pd.DataFrame):
        raise ValueError("records must be a pandas DataFrame.")

    def _candidate_key(entry):
        # Key before disambiguation, built from the row's own fields.
        return _bibtex_key(
            entry.get("authors"), entry.get("year"), entry.get("scopus_id")
        )

    entries = [series for _label, series in records.iterrows()]
    unique_keys = _disambiguate([_candidate_key(entry) for entry in entries])

    rendered = (
        _bibtex_entry(entry, key) for entry, key in zip(entries, unique_keys)
    )
    return "\n\n".join(rendered)

to_ris

to_ris(records)

Render records as an RIS string, one JOUR record per row.

Source code in src/scopusflow/export.py
134
135
136
137
138
def to_ris(records: pd.DataFrame) -> str:
    """Return ``records`` as RIS text, one record per row.

    Rows are rendered in DataFrame order by ``_ris_entry`` and separated by a
    blank line.

    Raises ``ValueError`` when ``records`` is not a pandas DataFrame.
    """
    if not isinstance(records, pd.DataFrame):
        raise ValueError("records must be a pandas DataFrame.")

    rendered = [_ris_entry(series) for _label, series in records.iterrows()]
    return "\n\n".join(rendered)