diff --git a/.github/workflows/get_schema_on_tag.yml b/.github/workflows/get_schema_on_tag.yml
new file mode 100644
index 0000000..52adc0c
--- /dev/null
+++ b/.github/workflows/get_schema_on_tag.yml
@@ -0,0 +1,47 @@
+name: Run Python Script on Tag
+
+on:
+  push:
+    tags:
+      - '*' # Trigger on all tags
+
+permissions:
+  contents: write
+
+jobs:
+  run-script:
+    runs-on: ubuntu-latest
+    env:
+      # Short tag name, e.g. "v1.2.0" for refs/tags/v1.2.0. Workflow `env`
+      # values are not shell-expanded, so the expression context is used.
+      TAG_NAME: ${{ github.ref_name }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r schemas/requirements.txt
+
+      - name: Run script with tag
+        run: |
+          echo "Running script with tag: $TAG_NAME"
+          python schemas/isaric_schema.py "$TAG_NAME"
+
+      - name: Commit and push changes
+        # NOTE(review): a tag push is checked out as a detached HEAD, so a bare
+        # `git push` may have no branch to update - confirm the target branch.
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          # Braces keep the shell from reading "TAG_NAME_isaric_long" as the variable;
+          # the path matches the file written by schemas/isaric_schema.py.
+          git add "schemas/arc_${TAG_NAME}_isaric_long.schema.json"
+          git commit -m "Add generated schema for tag $TAG_NAME" || echo "No changes to commit"
+          git push
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1055d1b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.venv +.vscode/ +.DS_Store + +__pycache__/ +*.pyc \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d09127..cd1d2bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,3 +12,50 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa - Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification. - Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies. 
+## ARC v1.1.1 (02 Jul 2025) + +### Overview +In ARC v1.1.1, variables in the ONSET & PRESENTATION section were updated to use the prefix pres_ instead of date_ to better reflect their meaning. + +### Variable‑Level Updates +Prefix change in ONSET & PRESENTATION section: Variables previously using the date_ prefix were renamed to pres_ (e.g., date_adm → pres_adm). + +### Interoperability +Standardised term code lists have been revised to align with SNOMED‑CT, LOINC, and UMLS. + +## ARC v1.1.0 (09 May 2025) + +### Overview +ARC v1.1.0 delivers a substantial expansion of the data model, introduces a dedicated **Acute Respiratory Infection (ARI)** preset, and separates **signs** from **symptoms** to improve semantic clarity and analytic power. + +### Column / Preset Changes +| Category | Details | +|---|---| +| **New presets / columns (1)** | `preset_ARChetype Syndromic CRF_ARI` **(new)** | +| **Renamed presets / columns (4)** | `preset_ARChetype CRF_Covid` → `preset_ARChetype Disease CRF_Covid`
`preset_ARChetype CRF_Dengue` → `preset_ARChetype Disease CRF_Dengue`
`preset_ARChetype CRF_Mpox` → `preset_ARChetype Disease CRF_Mpox`
`preset_ARChetype CRF_H5Nx` → `preset_ARChetype Disease CRF_H5Nx`| + +### Variable‑Level Updates +- **Added variables**: 472 (e.g., `adsym_blurryvis`, `adsym_cough_type`). +- **Removed variables**: 303 (e.g., `adasses_bacsi_oth`, `adasses_lymph`). +- **Field‑type changes**: 84 variables. +- **List updates**: 652 variables. +- **Answer‑choice updates**: 68 variables. +- **Signs vs Symptoms**: Clinical **signs** are now represented by brand‑new `sign_*` variables and no longer stored in `sympt_*`, clearly separating objective observations from patient‑reported symptoms. + +### Interoperability +Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**. + +## ARC v1.0.4 (04 Mar 2025) + +### Overview +The changes in ARC v1.0.4 are designed to enhance the system’s usability and ensure a more coherent structure for data entry and analysis. Users are encouraged to review their workflows and adjust any scripts or processes to reflect the updated variable names and group structures. + +### Key Updates + +**Renaming of Presets** + - The preset "disease" has been renamed to "ARChetype CRF" in the principal ARC CSV and in the lists. + +**Updates to Answer Choices** + - The test_biospecimentype variable has been changed to a userlist. + +--- diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json new file mode 100644 index 0000000..5d0056e --- /dev/null +++ b/schemas/isaric-core.json @@ -0,0 +1,113 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "title": "ISARIC Core Wide Schema", + "type": "object", + "properties": { + "subjid": { + "type": "string", + "description": "Patient Identification Number (PIN). May not uniquely identify a patient." + }, + "siteid": { + "type": "string", + "description": "Site ID that collected the data for the patient." + }, + "dataset_id": { + "type": "string", + "description": "Dataset ID that patient belongs to." 
+ }, + "dataset_disease": { + "type": "string", + "description": "Disease or syndrome corresponding to the primary reason for the data collection (same for each patient in the dataset)." + }, + "demog_sex": { + "type": "string", + "enum": [ + "male", + "female", + "other" + ], + "description": "Sex at birth" + }, + "demog_age_days": { + "type": "integer", + "minimum": 0, + "description": "Age in days" + }, + "demog_country_iso3": { + "type": "string", + "pattern": "^[A-Z]{3}$", + "description": "ISO 3166-1 alpha-3 country code" + }, + "pres_adm": { + "type": "string", + "description": "Admitted to hospital", + "enum": [ + "unknown", + "yes", + "no" + ] + }, + "pres_date": { + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], + "description": "Most recent presentation/admission date at this facility" + }, + "outco_outcome": { + "type": "string", + "enum": [ + "discharged_alive", + "still_hospitalised", + "transfer_to_other_facility", + "death", + "palliative_care", + "discharged_against_medical_advice", + "alive_not_admitted" + ], + "description": "Outcome" + }, + "outco_date": { + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], + "description": "Outcome date" + } + }, + "required": [ + "subjid", + "siteid", + "dataset_id", + "dataset_disease", + "demog_sex", + "demog_age_days", + "demog_country_iso3", + "pres_adm", + "pres_date", + "outco_outcome", + "outco_date" + ] +} \ No newline at end of file
diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
new file mode 100644
index 0000000..b840c27
--- /dev/null
+++ b/schemas/isaric_schema.py
@@ -0,0 +1,273 @@
+"""
+Auto-generates a long schema matching the ISARIC format with the latest ARC variables.
+
+To be run via a github-action when the ARC version is updated.
+"""
+
+import pandas as pd
+import json
+import numpy as np
+from pathlib import Path
+import sys
+
+
+def _attribute_name(group):
+    """Build the `attribute` sub-schema for the variables in `group`.
+
+    A single variable is matched with `const`; several share one `enum`.
+    """
+    variables = group.Variable.tolist()
+    if len(variables) == 1:
+        return {"const": variables[0]}
+    return {"enum": variables}
+
+
+def get_enums(options):
+    """Extracts the enum values from the 'Answer Options' field.
+
+    `options` is split on "|"; for each choice the text after its first comma
+    is kept, with surrounding spaces removed. A missing (NaN) field gives [].
+    """
+    if pd.isna(options):
+        return []
+    return [choice.partition(",")[2].strip(" ") for choice in options.split("|")]
+
+
+def attrs_with_enums(arc, types: list[str]):
+    """Build one rule per distinct 'Answer Options' value for rows of `types`.
+
+    Returns `(rules, remaining)` where `remaining` is `arc` without those rows.
+    """
+    rules = []
+    arc_filter = arc["Type"].isin(types)
+    arc_long_with_enums = arc[arc_filter]
+
+    # NOTE(review): groupby skips rows whose 'Answer Options' is NaN, yet they
+    # are still removed from `remaining` - confirm that is intended.
+    for options, group in arc_long_with_enums.groupby("Answer Options"):
+        rule = {
+            "properties": {
+                "attribute": _attribute_name(group),
+                "value": {"type": "string"},
+            },
+            "required": ["value"],
+        }
+        enums = get_enums(options)
+        if set(enums) == {"Yes", "No"}:
+            # Yes/No answers additionally get a boolean `value_bool` property.
+            rule["properties"]["value_bool"] = {"type": "boolean"}
+        else:
+            rule["properties"]["value"]["enum"] = enums
+
+        rules.append(rule)
+
+    return rules, arc[~arc_filter]
+
+
+def attrs_with_lists(arc, types: list[str]):
+    """Build one rule per distinct 'List' value for rows of `types`.
+
+    The allowed values are read from the first column of the matching CSV
+    under `Lists/`. Returns `(rules, remaining)`.
+
+    Raises:
+        FileNotFoundError: if a referenced list file does not exist.
+    """
+    rules = []
+    arc_filter = arc["Type"].isin(types)
+    arc_long_lists = arc[arc_filter]
+
+    for list_file, group in arc_long_lists.groupby("List"):
+        # Each "_" in the list name is a path separator below Lists/.
+        file_name = (list_file + ".csv").split("_")
+        path = Path("Lists", *file_name)
+        if not path.exists():
+            raise FileNotFoundError(f"List file {list_file} does not exist.")
+        # Drop empty cells and coerce to str so stripping never sees NaN/numbers.
+        first_column = pd.read_csv(path).iloc[:, 0].dropna().astype(str)
+        list_enums = first_column.str.strip().unique().tolist()
+
+        rule = {
+            "properties": {
+                "attribute": _attribute_name(group),
+                "value": {"type": "string", "enum": list_enums},
+            },
+            "required": ["value"],
+        }
+        rules.append(rule)
+    return rules, arc[~arc_filter]
+
+
+# Currently not used
+# def attrs_with_units(arc, types: list[str]):
+#     rules = []
+#     arc_filter = arc["Type"].isin(types)
+#     vars_with_units = arc[arc_filter]["Variable"]
+#     arc_vars_to_remove = vars_with_units.copy().to_list()
+
+#     for var in vars_with_units:
+#         unit_options = arc[arc["Variable"].str.startswith(var + "_")][
+#             "Variable"
+#         ].to_list()
+#         arc_vars_to_remove += unit_options
+
+#         units = [u.removeprefix(var + "_") for u in unit_options]
+
+#         rule = {
+#             "properties": {
+#                 "attribute": {"const": var},
+#                 "attribute_unit": {"enum": units},
+#                 "value_num": {"type": "number"},
+#             },
+#             "required": ["value_num", "attribute_unit"],
+#         }
+
+#         rules.append(rule)
+
+#     return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)]
+
+
+def numeric_attrs(arc, types: list[str]):
+    """Build one rule per distinct (Minimum, Maximum) pair for rows of `types`.
+
+    Missing bounds are kept as their own group and simply left unconstrained.
+    Returns `(rules, remaining)`.
+    """
+    rules = []
+    arc_filter = arc["Type"].isin(types)
+    arc_long_numeric = arc[arc_filter]
+
+    for (lower, upper), group in arc_long_numeric.groupby(
+        ["Minimum", "Maximum"], dropna=False
+    ):
+        value_num = {"type": "number"}
+        if not pd.isna(lower):
+            value_num["minimum"] = float(lower)
+        if not pd.isna(upper):
+            value_num["maximum"] = float(upper)
+
+        rule = {
+            "properties": {
+                "attribute": _attribute_name(group),
+                "value_num": value_num,
+            },
+            "required": ["value_num"],
+        }
+        rules.append(rule)
+    return rules, arc[~arc_filter]
+
+
+def date_attrs(arc, types: list[str]):
+    """Build one rule per date 'Type' for rows of `types`.
+
+    `date_dmy` values use format "date"; `datetime_dmy` values use "date-time".
+    Returns `(rules, remaining)`.
+    """
+    rules = []
+    arc_filter = arc["Type"].isin(types)
+    arc_long_dates = arc[arc_filter]
+
+    for input_type, group in arc_long_dates.groupby("Type"):
+        rule = {
+            "properties": {"attribute": _attribute_name(group)},
+            "required": ["value"],
+        }
+        if input_type == "date_dmy":
+            rule["properties"]["value"] = {"type": "string", "format": "date"}
+        elif input_type == "datetime_dmy":
+            rule["properties"]["value"] = {"type": "string", "format": "date-time"}
+
+        rules.append(rule)
+    return rules, arc[~arc_filter]
+
+
+def generic_str_attrs(arc, types: list[str]):
+    """Build a single free-text rule covering every row of `types`.
+
+    Returns `([rule], remaining)`.
+    """
+    arc_filter = arc["Type"].isin(types)
+    arc_long_other_str = arc[arc_filter]
+
+    rule = {
+        "properties": {
+            "attribute": {"enum": arc_long_other_str.Variable.tolist()},
+            "value": {"type": "string"},
+        },
+        "required": ["value"],
+    }
+
+    return [rule], arc[~arc_filter]
+
+
+def generate_long_schema(version):
+    """Write `schemas/arc_<version>_isaric_long.schema.json` from ARC.csv.
+
+    All paths are relative to the current working directory.
+
+    Raises:
+        ValueError: if ARC.csv has rows whose 'Type' no rule generator handles.
+    """
+    arc = pd.read_csv("ARC.csv")
+
+    with open("schemas/isaric-core.json", "r", encoding="utf-8") as f:
+        template_core = json.load(f)
+
+    with open("schemas/template-isaric-long.json", "r", encoding="utf-8") as f:
+        template_long = json.load(f)
+
+    # Drop the core properties from the long schema
+    # Don't include descriptive, file types or NaN's (unwanted as stored attributes)
+    arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
+    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file", np.nan]))]
+
+    # Generate rules for each type of attribute
+    enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"])
+
+    list_rules, arc_no_lists = attrs_with_lists(
+        arc_no_enums, ["list", "user_list", "multi_list"]
+    )
+
+    numeric_rules, arc_no_numbers = numeric_attrs(arc_no_lists, ["number", "calc"])
+
+    date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"])
+
+    other_str_rules, arc_no_other_str = generic_str_attrs(
+        arc_no_dates, ["text", "notes"]
+    )
+
+    # Combine all rules into one list
+    one_of_rules = (
+        enum_rules + list_rules + numeric_rules + date_rules + other_str_rules
+    )
+
+    # check no types have been missed
+    if len(arc_no_other_str) > 0:
+        # One formatted message: separate positional args would render as a tuple.
+        raise ValueError(
+            "The following rows were not processed: \n"
+            f"{arc_no_other_str}\n"
+            "Please check the ARC.csv file for any new types."
+        )
+
+    # Overrides any oneOf already present in the template.
+    template_long["oneOf"] = one_of_rules
+
+    # Generate new long schema
+    with open(
+        f"schemas/arc_{version}_isaric_long.schema.json", "w", encoding="utf-8"
+    ) as f:
+        json.dump(template_long, f, indent=4)
+
+
+def main():
+    """Command-line entry point: `python schemas/isaric_schema.py <tag>`."""
+    if len(sys.argv) < 2:
+        sys.exit(f"Usage: {sys.argv[0]} <tag>")
+    tag = sys.argv[1]
+    print(f"Running script with tag: {tag}")
+    generate_long_schema(tag)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/schemas/requirements.txt b/schemas/requirements.txt new file mode 100644 index 0000000..6d7fa5e --- /dev/null +++ b/schemas/requirements.txt @@ -0,0 +1,2 @@ +numpy==2.3.1 +pandas==2.3.1 diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json new file mode 100644 index 0000000..6210e5b --- /dev/null +++ b/schemas/template-isaric-long.json @@ -0,0 +1,119 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ISARIC Long Schema", + "additionalProperties": false, + "type": "object", + "properties": { + "subjid": { + "type": "string", + "description": "Patient Identification Number (PIN). May not uniquely identify a patient." + }, + "dataset_id": { + "type": "string", + "description": "Study ID that patient belongs to." + }, + "event_id": { + "type": [ + "string", + "null" + ], + "description": "ID code to link different attributes which relate to the same event (e.g. medication being administered) together." + }, + "phase": { + "type": "string", + "enum": [ + "presentation", + "pre_observation", + "during_observation", + "followup", + "outcome" + ], + "description": "Healthcare encounter phase" + }, + "attribute": { + "type": "string", + "description": "Name of the attribute/event being recorded (e.g. temperature, blood pressure, etc.). Where an attribute with the same or substantially similar semantics exist in ARC, that attribute name must be used." + }, + "attribute_unit": { + "type": [ + "string", + "null" + ], + "description": "Unit of the attribute being recorded (e.g. 'mg/dL'). Null if the attribute has no unit." + }, + "arcver": { + "type": [ + "string", + "null" + ], + "description": "ARC version used to generate the CRF. Null if not generated from ARC." + }, + "value": { + "type": [ + "string", + "null" + ], + "description": "String data associated with the attribute." 
+ }, + "value_num": { + "type": [ + "number", + "null" + ], + "description": "Value of numerical measurements, e.g. temperature, BP." + }, + "reldate_adm": { + "type": [ + "integer", + "null" + ], + "description": "Relative day since admission." + }, + "date": { + "type": [ + "string", + "null" + ], + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], + "description": "Date of event." + }, + "duration": { + "type": [ + "integer", + "null" + ], + "description": "Duration of the event in days." + } + }, + "oneOf": [ + { + "required": [ + "value" + ] + }, + { + "required": [ + "value_num" + ] + } + ], + "required": [ + "subjid", + "dataset_id", + "phase", + "attribute" + ] +} \ No newline at end of file