Skip to content

DOIs & tracking

Extract clean DOIs and compare two retrievals to see exactly what changed.

extract_dois

extract_dois(records, dedupe=True)

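Pull cleaned DOIs from records or a list. With dedupe=True (the default), duplicates are removed case-insensitively, keeping the first occurrence and its original casing.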

Source code in src/scopusflow/diff.py
31
32
33
34
35
36
37
38
39
40
41
42
43
def extract_dois(records, dedupe: bool = True) -> list[str]:
    """Return the DOIs found in ``records``, optionally without duplicates.

    The input is passed through ``_as_dois`` and then ``_clean`` (both
    defined elsewhere in this module).

    Args:
        records: Records or a list of DOIs, in whatever form ``_as_dois``
            accepts.
        dedupe: When true (the default), drop repeated DOIs. Two DOIs count
            as the same if they are equal after lower-casing; the first
            occurrence is kept with its original casing and position.

    Returns:
        The cleaned DOIs, de-duplicated if requested.
    """
    cleaned = _clean(_as_dois(records))
    if dedupe:
        # dicts preserve insertion order and setdefault never overwrites, so
        # each lower-cased key maps to the first spelling encountered.
        first_spelling: dict[str, str] = {}
        for doi in cleaned:
            first_spelling.setdefault(doi.lower(), doi)
        return list(first_spelling.values())
    return cleaned

diff_dois

diff_dois(old, new)

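Compare two retrievals and return a DataFrame with columns doi and status, where status is added, removed or unchanged. DOIs are compared case-insensitively; an unchanged DOI is reported with the casing from the new retrieval. Rows are sorted by status, then by doi.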

Source code in src/scopusflow/diff.py
46
47
48
49
50
51
52
53
54
55
56
57
58
def diff_dois(old, new) -> pd.DataFrame:
    """Report which DOIs were added, removed or kept between two retrievals.

    Both inputs are reduced to DOI lists with ``extract_dois`` and matched
    case-insensitively (by lower-casing).

    Args:
        old: The earlier retrieval, in any form ``extract_dois`` accepts.
        new: The later retrieval, in any form ``extract_dois`` accepts.

    Returns:
        A DataFrame with columns ``doi`` and ``status``, where ``status`` is
        ``added``, ``removed`` or ``unchanged``. Rows are sorted by status,
        then by DOI, with a fresh 0-based index. Unchanged DOIs are reported
        with the casing found in ``new``.
    """
    previous = extract_dois(old)
    current = extract_dois(new)
    previous_keys = {doi.lower() for doi in previous}
    current_keys = {doi.lower() for doi in current}

    # One pass over the new retrieval splits it into added vs. unchanged.
    added_rows = []
    unchanged_rows = []
    for doi in current:
        if doi.lower() in previous_keys:
            unchanged_rows.append((doi, "unchanged"))
        else:
            added_rows.append((doi, "added"))

    removed_rows = [
        (doi, "removed") for doi in previous if doi.lower() not in current_keys
    ]

    table = pd.DataFrame(
        added_rows + removed_rows + unchanged_rows,
        columns=["doi", "status"],
    )
    ordered = table.sort_values(["status", "doi"])
    return ordered.reset_index(drop=True)