Records¶

Normalise results into one stable schema, tally the most frequent values, and export to reference-manager formats.

to_records ¶

to_records(results, query=None)

Normalise a pybliometrics `ScopusSearch().results` list (named tuples) or a list of dicts into a tidy DataFrame whose columns are `RECORD_COLUMNS`.

Whatever the query type, the columns are the same, so the downstream DOI, diff and analysis helpers can rely on them.

Source code in src/scopusflow/records.py

def to_records(results, query: str | None = None) -> pd.DataFrame:
    """Convert Scopus search results into the standard records table.

    ``results`` is a pybliometrics ``ScopusSearch().results`` list (named
    tuples) or a list of dicts; a falsy value is treated as "no results".
    Each item becomes one row, numbered from 1 in ``entry_number``, and
    ``query`` is copied unchanged onto every row.

    The returned frame is always laid out with the :data:`RECORD_COLUMNS`
    columns, whatever the query type, so the downstream DOI, diff and
    analysis helpers can rely on them.
    """

    def _as_row(position, item):
        # Build the dict for a single result; fields are read through ``_get``.
        eid = _get(item, "eid")
        if eid:
            # Keep only what follows the "2-s2.0-" prefix of the EID.
            scopus_id = str(eid).split("2-s2.0-")[-1]
        else:
            scopus_id = pd.NA
        cover_date = _get(item, "coverDate")
        cited_by = _get(item, "citedby_count")
        return {
            "entry_number": position,
            "scopus_id": scopus_id,
            "doi": _get(item, "doi"),
            "title": _get(item, "title"),
            # pybliometrics joins multiple authors with ';' in author_names;
            # ``creator`` is only consulted when author_names is falsy.
            "authors": _get(item, "author_names") or _get(item, "creator"),
            "year": _year(cover_date),
            "date": cover_date,
            "publication": _get(item, "publicationName"),
            # A missing or empty citation count becomes NA rather than 0.
            "citations": pd.NA if cited_by in (None, "") else int(cited_by),
            "query": query,
        }

    table = [
        _as_row(position, item)
        for position, item in enumerate(results or [], start=1)
    ]
    return pd.DataFrame(table, columns=RECORD_COLUMNS)

top ¶

top(records, by='source', n=10)

Tally the most frequent sources or authors in a record set.

Source code in src/scopusflow/records.py

def top(records: pd.DataFrame, by: str = "source", n: int = 10) -> pd.DataFrame:
    """Tally the most frequent sources or authors in a record set.

    Args:
        records: Frame with a ``publication`` column (used for
            ``by="source"``) and an ``authors`` column (used for
            ``by="author"``) holding ``;``-separated names.
        by: ``"source"`` to count ``publication`` values, ``"author"`` to
            count individual author names.
        n: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ``value`` and ``n``, most frequent first.
        Missing values are ignored, and so are empty author names.

    Raises:
        ValueError: If ``by`` is neither ``"source"`` nor ``"author"``.
    """
    if by == "source":
        values = records["publication"].dropna()
    elif by == "author":
        authors = records["authors"].dropna()
        if authors.empty:
            # A column that is entirely missing may carry a non-string dtype
            # (e.g. float64 after a CSV round-trip), on which ``.str`` raises
            # AttributeError. There is nothing to split, so tally nothing.
            values = authors.astype(object)
        else:
            # One entry per author: split on ';', flatten, trim whitespace.
            values = authors.str.split(";").explode().str.strip()
            # A trailing ';' or ';;' leaves empty names behind; drop them.
            values = values[values != ""]
    else:
        raise ValueError("by must be 'source' or 'author'.")
    counts = values.value_counts().head(n)
    return counts.rename_axis("value").reset_index(name="n")

RECORD_COLUMNS `module-attribute` ¶

# Column names, in order, of the records DataFrame.
RECORD_COLUMNS = [
    "entry_number",
    "scopus_id",
    "doi",
    "title",
    "authors",
    "year",
    "date",
    "publication",
    "citations",
    "query",
]

to_bibtex ¶

to_bibtex(records)

Render records as a BibTeX string, one @article entry per row, with citation keys made unique within the export.

Source code in src/scopusflow/export.py

def to_bibtex(records: pd.DataFrame) -> str:
    """Export a records DataFrame as BibTeX text.

    Every row becomes one ``@article`` entry, and entries are separated by a
    blank line. Citation keys are derived from each row's ``authors``,
    ``year`` and ``scopus_id`` and then made unique within the export.

    Raises:
        ValueError: If ``records`` is not a pandas DataFrame.
    """
    if not isinstance(records, pd.DataFrame):
        raise ValueError("records must be a pandas DataFrame.")

    entries = [series for _label, series in records.iterrows()]

    # Candidate keys first; collisions are resolved across the whole export.
    candidate_keys = []
    for series in entries:
        candidate_keys.append(
            _bibtex_key(
                series.get("authors"),
                series.get("year"),
                series.get("scopus_id"),
            )
        )
    unique_keys = _disambiguate(candidate_keys)

    rendered = []
    for series, key in zip(entries, unique_keys):
        rendered.append(_bibtex_entry(series, key))
    return "\n\n".join(rendered)

to_ris ¶

to_ris(records)

Render records as an RIS string, one JOUR record per row.