From 470adaf170074053bb21f293eb8c7a7f500691ab Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 21 Jul 2025 14:44:53 +0100 Subject: [PATCH 01/13] Adds a workflow to auto-generate an isaric-long schema populated with the current ARC variables --- .github/workflows/get_schema_on_tag.yml | 42 ++++++ .gitignore | 1 + schemas/isaric-core.json | 82 +++++++++++ schemas/isaric_schema.py | 180 ++++++++++++++++++++++++ schemas/requirements.txt | 2 + schemas/template-isaric-long.json | 122 ++++++++++++++++ 6 files changed, 429 insertions(+) create mode 100644 .github/workflows/get_schema_on_tag.yml create mode 100644 .gitignore create mode 100644 schemas/isaric-core.json create mode 100644 schemas/isaric_schema.py create mode 100644 schemas/requirements.txt create mode 100644 schemas/template-isaric-long.json diff --git a/.github/workflows/get_schema_on_tag.yml b/.github/workflows/get_schema_on_tag.yml new file mode 100644 index 0000000..52adc0c --- /dev/null +++ b/.github/workflows/get_schema_on_tag.yml @@ -0,0 +1,42 @@ +name: Run Python Script on Tag + +on: + push: + tags: + - '*' # Trigger on all tags + +permissions: + contents: write + +jobs: + run-script: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run script with tag + run: | + TAG_NAME=${GITHUB_REF#refs/tags/} + echo "Running script with tag: $TAG_NAME" + python schemas/isaric_schema.py "$TAG_NAME" + + - name: Commit and push changes + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git add arc_$TAG_NAME_isaric_long_schema.json + git commit -m "Add generated schema for tag $TAG_NAME" || echo "No changes to commit" + git push + env: + TAG_NAME: ${GITHUB_REF#refs/tags/} 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b694934 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv \ No newline at end of file diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json new file mode 100644 index 0000000..78e4b92 --- /dev/null +++ b/schemas/isaric-core.json @@ -0,0 +1,82 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ISARIC Core Wide Schema", + "type": "object", + "properties": { + "subjid": { + "type": "string", + "description": "Patient Identification Number (PIN). May not uniquely identify a patient." + }, + "siteid": { + "type": "string", + "description": "Site ID that collected the data for the patient." + }, + "dataset_id": { + "type": "string", + "description": "Dataset ID that patient belongs to." + }, + "dataset_disease": { + "type": "string", + "description": "Disease or syndrome corresponding to the primary reason for the data collection (same for each patient in the dataset)." + }, + "demog_sex": { + "type": "string", + "enum": [ + "male", + "female", + "other" + ], + "description": "Sex at birth" + }, + "demog_age_days": { + "type": "integer", + "minimum": 0, + "description": "Age in days" + }, + "demog_country_iso3": { + "type": "string", + "pattern": "^[A-Z]{3}$", + "description": "ISO 3166-1 alpha-3 country code" + }, + "pres_adm": { + "type": "boolean", + "description": "Admitted to hospital" + }, + "pres_date": { + "type": "string", + "format": "datetime", + "description": "Most recent presentation/admission date at this facility" + }, + "outco_outcome": { + "type": "string", + "enum": [ + "discharged_alive", + "still_hospitalised", + "transfer_to_other_facility", + "death", + "palliative_care", + "discharged_against_medical_advice", + "alive_not_admitted" + ], + "description": "Outcome" + }, + "outco_date": { + "type": "string", + "format": "datetime", + "description": "Outcome date" + } + }, + "required": [ + "subjid", + "siteid", + "dataset_id", + 
"dataset_disease", + "demog_sex", + "demog_age_days", + "demog_country_iso3", + "pres_adm", + "pres_date", + "outco_outcome", + "outco_date" + ] +} \ No newline at end of file diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py new file mode 100644 index 0000000..9e3f371 --- /dev/null +++ b/schemas/isaric_schema.py @@ -0,0 +1,180 @@ +""" +Auto-generates a long schema matching the ISARIC format with the latest ARC variables. + +To be run via a github-action when the ARC version is updated. +""" + +import pandas as pd +import json +import numpy as np +from pathlib import Path +import sys + + +def get_enums(options): + """Extracts the enum values from the 'Answer Options' field.""" + if pd.isna(options): + return [] + return [ + ",".join(c.split(",")[1:]).lstrip(" ").rstrip(" ") for c in options.split("|") + ] + + +def attrs_with_enums(arc, types: list[str], all_types: list[str]): + rules = [] + arc_long_with_enums = arc[arc["Type"].isin(types)] + + for options, group in arc_long_with_enums.groupby("Answer Options"): + rule = { + "properties": {"attribute": {"enum": group.Variable.tolist()}}, + "required": ["value"], + } + rule["properties"]["value"] = {"type": "string"} + enums = get_enums(options) + if set(enums) == {"Yes", "No"}: + rule["properties"]["value_bool"] = {"type": "boolean"} + else: + rule["properties"]["value"]["enum"] = enums + + rules.append(rule) + # drop from the list of all types + all_types = [t for t in all_types if t not in types] + return rules, all_types + + +def attrs_with_lists(arc, types: list[str], all_types: list[str]): + rules = [] + arc_long_lists = arc[arc["Type"].isin(types)] + + for list_file, group in arc_long_lists.groupby("List"): + rule = { + "properties": {"attribute": {"const": group.Variable.tolist()}}, + "required": ["value"], + } + file_name = (list_file + ".csv").split("_") + path = Path(*["Lists"] + file_name) + if not path.exists(): + raise FileNotFoundError(f"List file {list_file} does not exist.") + 
list_enums = pd.read_csv(path).iloc[:, 0].unique().tolist() + + rule["properties"]["value"] = {"type": "string", "enum": list_enums} + + rules.append(rule) + all_types = [t for t in all_types if t not in types] + return rules, all_types + + +def numeric_attrs(arc, types: list[str], all_types: list[str]): + rules = [] + arc_long_numeric = arc[arc["Type"].isin(types)] + + for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]): + min, max = min_max + rule = { + "properties": {"attribute": {"const": group.Variable.tolist()}}, + "required": ["value_num", "attribute_unit"], + } + rule["properties"]["value_num"] = { + "type": "number", + "minimum": float(min), + "maximum": float(max), + } + + rules.append(rule) + all_types = [t for t in all_types if t not in types] + return rules, all_types + + +def date_attrs(arc, types: list[str], all_types: list[str]): + rules = [] + arc_long_dates = arc[arc["Type"].isin(types)] + + for input_type, group in arc_long_dates.groupby("Type"): + rule = { + "properties": {"attribute": {"const": group.Variable.tolist()}}, + "required": ["value"], + } + if input_type == "date_dmy": + rule["properties"]["value"] = {"type": "string", "format": "date"} + elif input_type == "datetime_dmy": + rule["properties"]["value"] = {"type": "string", "format": "date-time"} + + rules.append(rule) + all_types = [t for t in all_types if t not in types] + return rules, all_types + + +def generic_str_attrs(arc, types: list[str], all_types: list[str]): + arc_long_other_str = arc[arc["Type"].isin(types)] + + rule = { + "properties": {"attribute": {"const": arc_long_other_str.Variable.tolist()}} + } + rule["properties"]["value"] = {"type": "string"} + rule["required"] = ["value"] + + all_types = [t for t in all_types if t not in types] + return [rule], all_types + + +def generate_long_schema(version): + arc = pd.read_csv("ARC.csv") + + with open("schemas/isaric-core.json", "r") as f: + template_core = json.load(f) + + with 
open("schemas/template-isaric-long.json", "r") as f: + template_long = json.load(f) + + # Drop the core properties from the long schema + # Don't include descriptive, file, or NaN types (unwanted as stored attributes) + arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())] + arc_long = arc_long[~(arc_long.Type.isin(["descriptive", np.nan, "file"]))] + + # Get all the response types from ARC + all_types = arc_long.Type.unique().tolist() + + # Generate rules for each type of attribute + enum_rules, all_types = attrs_with_enums(arc_long, ["radio", "checkbox"], all_types) + + list_rules, all_types = attrs_with_lists( + arc_long, ["list", "user_list", "multi_list"], all_types + ) + + numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types) + + date_rules, all_types = date_attrs( + arc_long, ["date_dmy", "datetime_dmy"], all_types + ) + + other_str_rules, all_types = generic_str_attrs( + arc_long, ["text", "notes"], all_types + ) + + # Combine all rules into one list + one_of_rules = ( + enum_rules + list_rules + numeric_rules + date_rules + other_str_rules + ) + + # check no types have been missed + if len(all_types) > 0: + raise ValueError( + f"The following types were not processed: {', '.join(all_types)}. " + "Please check the ARC.csv file for any new types." 
+ ) + + template_long["oneOf"] = one_of_rules + + # Generate new long schema + with open(f"schemas/arc_{version}_isaric_long_schema.json", "w") as f: + json.dump(template_long, f, indent=4) + + +def main(): + tag = sys.argv[1] + print(f"Running script with tag: {tag}") + generate_long_schema(tag) + + +if __name__ == "__main__": + main() diff --git a/schemas/requirements.txt b/schemas/requirements.txt new file mode 100644 index 0000000..6d7fa5e --- /dev/null +++ b/schemas/requirements.txt @@ -0,0 +1,2 @@ +numpy==2.3.1 +pandas==2.3.1 diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json new file mode 100644 index 0000000..f459d49 --- /dev/null +++ b/schemas/template-isaric-long.json @@ -0,0 +1,122 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ISARIC Long Schema", + "additionalProperties": false, + "type": "object", + "properties": { + "subjid": { + "type": "string", + "description": "Patient Identification Number (PIN). May not uniquely identify a patient." + }, + "dataset_id": { + "type": "string", + "description": "Study ID that patient belongs to." + }, + "phase": { + "type": "string", + "enum": [ + "presentation", + "pre_observation", + "during_observation", + "followup", + "outcome" + ], + "description": "Healthcare encounter phase" + }, + "attribute": { + "type": "string", + "description": "Name of the attribute/event being recorded (e.g. temperature, blood pressure, etc.). Where an attribute with the same or substantially similar semantics exist in ARC, that attribute name must be used." + }, + "attribute_unit": { + "type": [ + "string", + "null" + ], + "description": "Unit of the attribute being recorded (e.g. 'mg/dL'). Null if the attribute has no unit." + }, + "arcver": { + "type": [ + "string", + "null" + ], + "description": "ARC version used to generate the CRF. Null if not generated from ARC." 
+ }, + "value": { + "type": [ + "string", + "null" + ], + "description": "String data associated with the attribute." + }, + "value_bool": { + "type": [ + "boolean", + "null" + ], + "description": "Use for Yes/No attributes." + }, + "value_num": { + "type": [ + "number", + "null" + ], + "description": "Value of numerical measurements, e.g. temperature, BP." + }, + "reldate_adm": { + "type": [ + "integer", + "null" + ], + "description": "Relative day since admission." + }, + "reldate_adm_end": { + "type": [ + "integer", + "null" + ], + "description": "Relative day since admission, end of period starting from reldate_adm. Should be Null if event spans <= 1 day." + }, + "date": { + "type": [ + "string", + "null" + ], + "format": "datetime", + "description": "Date of event." + }, + "duration": { + "type": [ + "integer", + "null" + ], + "description": "Duration of the event in days." + } + }, + "oneOf": [ + { + "required": [ + "value" + ] + }, + { + "required": [ + "value_bool" + ] + }, + { + "required": [ + "value_num" + ] + } + ], + "required": [ + "subjid", + "dataset_id", + "phase", + "attribute", + "arcver", + "reldate_adm", + "date", + "duration" + ] +} \ No newline at end of file From a1c785510b0ef9cde0ebaee19e34b1672959afab Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 21 Jul 2025 15:48:21 +0100 Subject: [PATCH 02/13] fix schema 'const' being given a list of options --- schemas/isaric_schema.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index 9e3f371..d66fa37 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -25,8 +25,12 @@ def attrs_with_enums(arc, types: list[str], all_types: list[str]): arc_long_with_enums = arc[arc["Type"].isin(types)] for options, group in arc_long_with_enums.groupby("Answer Options"): + if len(group) == 1: + name = {"const": group.Variable.iloc[0]} + else: + name = {"enum": group.Variable.tolist()} rule = { 
- "properties": {"attribute": {"enum": group.Variable.tolist()}}, + "properties": {"attribute": name}, "required": ["value"], } rule["properties"]["value"] = {"type": "string"} @@ -47,8 +51,12 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]): arc_long_lists = arc[arc["Type"].isin(types)] for list_file, group in arc_long_lists.groupby("List"): + if len(group) == 1: + name = {"const": group.Variable.iloc[0]} + else: + name = {"enum": group.Variable.tolist()} rule = { - "properties": {"attribute": {"const": group.Variable.tolist()}}, + "properties": {"attribute": name}, "required": ["value"], } file_name = (list_file + ".csv").split("_") @@ -70,8 +78,12 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]): for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]): min, max = min_max + if len(group) == 1: + name = {"const": group.Variable.iloc[0]} + else: + name = {"enum": group.Variable.tolist()} rule = { - "properties": {"attribute": {"const": group.Variable.tolist()}}, + "properties": {"attribute": name}, "required": ["value_num", "attribute_unit"], } rule["properties"]["value_num"] = { @@ -90,8 +102,12 @@ def date_attrs(arc, types: list[str], all_types: list[str]): arc_long_dates = arc[arc["Type"].isin(types)] for input_type, group in arc_long_dates.groupby("Type"): + if len(group) == 1: + name = {"const": group.Variable.iloc[0]} + else: + name = {"enum": group.Variable.tolist()} rule = { - "properties": {"attribute": {"const": group.Variable.tolist()}}, + "properties": {"attribute": name}, "required": ["value"], } if input_type == "date_dmy": @@ -107,9 +123,7 @@ def date_attrs(arc, types: list[str], all_types: list[str]): def generic_str_attrs(arc, types: list[str], all_types: list[str]): arc_long_other_str = arc[arc["Type"].isin(types)] - rule = { - "properties": {"attribute": {"const": arc_long_other_str.Variable.tolist()}} - } + rule = {"properties": {"attribute": {"enum": arc_long_other_str.Variable.tolist()}}} 
rule["properties"]["value"] = {"type": "string"} rule["required"] = ["value"] From 0c21cbccc41a05dacebc568b6c5d26369f356536 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 29 Jul 2025 16:19:21 +0100 Subject: [PATCH 03/13] Stop numerical variables being dropped when min/max is null --- schemas/isaric_schema.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index d66fa37..a99bffb 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -76,7 +76,9 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]): rules = [] arc_long_numeric = arc[arc["Type"].isin(types)] - for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]): + for min_max, group in arc_long_numeric.groupby( + ["Minimum", "Maximum"], dropna=False + ): min, max = min_max if len(group) == 1: name = {"const": group.Variable.iloc[0]} @@ -86,11 +88,11 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]): "properties": {"attribute": name}, "required": ["value_num", "attribute_unit"], } - rule["properties"]["value_num"] = { - "type": "number", - "minimum": float(min), - "maximum": float(max), - } + rule["properties"]["value_num"] = {"type": "number"} + if not pd.isna(min): + rule["properties"]["value_num"]["minimum"] = float(min) + if not pd.isna(max): + rule["properties"]["value_num"]["maximum"] = float(max) rules.append(rule) all_types = [t for t in all_types if t not in types] From f627daab65e567e5e3db5674fed1f92631506861 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 7 Aug 2025 13:27:49 +0100 Subject: [PATCH 04/13] edit schemas --- schemas/isaric-core.json | 41 +++++++++++++++++++++++++++---- schemas/isaric_schema.py | 2 +- schemas/template-isaric-long.json | 21 +++++++++++----- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json index 78e4b92..5d0056e 100644 --- a/schemas/isaric-core.json +++ 
b/schemas/isaric-core.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema#", + "$schema": "http://json-schema.org/draft-07/schema", "title": "ISARIC Core Wide Schema", "type": "object", "properties": { @@ -39,12 +39,30 @@ "description": "ISO 3166-1 alpha-3 country code" }, "pres_adm": { - "type": "boolean", - "description": "Admitted to hospital" + "type": "string", + "description": "Admitted to hospital", + "enum": [ + "unknown", + "yes", + "no" + ] }, "pres_date": { "type": "string", - "format": "datetime", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], "description": "Most recent presentation/admission date at this facility" }, "outco_outcome": { @@ -62,7 +80,20 @@ }, "outco_date": { "type": "string", - "format": "datetime", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], "description": "Outcome date" } }, diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index a99bffb..b494e54 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -86,7 +86,7 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]): name = {"enum": group.Variable.tolist()} rule = { "properties": {"attribute": name}, - "required": ["value_num", "attribute_unit"], + "required": ["value_num"], } rule["properties"]["value_num"] = {"type": "number"} if not pd.isna(min): diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json index f459d49..a23141e 100644 --- a/schemas/template-isaric-long.json +++ b/schemas/template-isaric-long.json @@ -81,7 +81,20 @@ "string", "null" ], - "format": "datetime", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + }, + { + "pattern": "^[0-9]{4}-[0-9]{2}$" + }, + { + "pattern": "^[0-9]{4}$" + } + ], "description": "Date of event." 
}, "duration": { @@ -113,10 +126,6 @@ "subjid", "dataset_id", "phase", - "attribute", - "arcver", - "reldate_adm", - "date", - "duration" + "attribute" ] } \ No newline at end of file From 199b8e70f17e9d239ad22df696932cf068555260 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 8 Aug 2025 14:58:14 +0100 Subject: [PATCH 05/13] strip trailing/leading spaces from variables stored in separate files --- schemas/isaric_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index b494e54..d4a6c63 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -63,7 +63,7 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]): path = Path(*["Lists"] + file_name) if not path.exists(): raise FileNotFoundError(f"List file {list_file} does not exist.") - list_enums = pd.read_csv(path).iloc[:, 0].unique().tolist() + list_enums = [x.strip() for x in pd.read_csv(path).iloc[:, 0].unique().tolist()] rule["properties"]["value"] = {"type": "string", "enum": list_enums} From d6e70a9d376d4a4d82bff1806c60823b0aefb62e Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 8 Aug 2025 15:03:40 +0100 Subject: [PATCH 06/13] Change new schema name --- schemas/isaric_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index d4a6c63..dbdb89f 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -182,7 +182,7 @@ def generate_long_schema(version): template_long["oneOf"] = one_of_rules # Generate new long schema - with open(f"schemas/arc_{version}_isaric_long_schema.json", "w") as f: + with open(f"schemas/arc_{version}_isaric_long.schema.json", "w") as f: json.dump(template_long, f, indent=4) From 758b213cb562c53ccfc4394ff848cfb8353c937b Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 9 Sep 2025 13:02:59 -0700 Subject: [PATCH 07/13] Add some history to changelog.md file --- CHANGELOG.md | 39 
+++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d09127..201fbfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,3 +12,42 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa - Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification. - Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies. +### Interoperability +Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**. + +## ARC v1.1.0 (09 May 2025) + +### Overview +ARC v1.1.0 delivers a substantial expansion of the data model, introduces a dedicated **Acute Respiratory Infection (ARI)** preset, and separates **signs** from **symptoms** to improve semantic clarity and analytic power. + +### Column / Preset Changes +| Category | Details | +|---|---| +| **New presets / columns (1)** | `preset_ARChetype Syndromic CRF_ARI` **(new)** | +| **Renamed / (4)** | `preset_ARChetype CRF_Covid` → `preset_ARChetype Disease CRF_Covid`
`preset_ARChetype CRF_Dengue` → `preset_ARChetype Disease CRF_Dengue`
`preset_ARChetype CRF_Mpox` → `preset_ARChetype Disease CRF_Mpox`
`preset_ARChetype CRF_H5Nx` → `preset_ARChetype Disease CRF_H5Nx`| + +### Variable‑Level Updates +- **Added variables**: 472 (e.g., `adsym_blurryvis`, `adsym_cough_type`). +- **Removed variables**: 303 (e.g., `adasses_bacsi_oth`, `adasses_lymph`). +- **Field‑type changes**: 84 variables. +- **List updates**: 652 variables. +- **Answer‑choice updates**: 68 variables. +- **Signs vs Symptoms**: Clinical **signs** are now represented by brand‑new `sign_*` variables and no longer stored in `sympt_*`, clearly separating objective observations from patient‑reported symptoms. + +### Interoperability +Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**. + +## ARC v1.0.4 (04 Mar 2025) + +### Overview +The changes in ARC v1.0.4 are designed to enhance the system’s usability and ensure a more coherent structure for data entry and analysis. Users are encouraged to review their workflows and adjust any scripts or processes to reflect the updated variable names and group structures. + +### Key Updates + +**Renaming of Presets** + - The preset "disease" has been renamed to "ARChetype CRF" in the principal ARC CSV and in the lists. + +**Updates to Answer Choices** + - The test_biospecimentype variable has been changed to a userlist. 
+ +--- From e5d0fc813923fd2f7ea4cc1785a75c5caef5b3f9 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 9 Sep 2025 13:47:48 -0700 Subject: [PATCH 08/13] Get units into schema with static variable name --- schemas/isaric_schema.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index dbdb89f..f33f923 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -72,6 +72,32 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]): return rules, all_types +def attrs_with_units(arc, types: list[str], all_types: list[str]): + rules = [] + vars_with_units = arc[arc["Type"].isin(types)]["Variable"] + + for var in vars_with_units: + unit_options = arc[arc["Variable"].str.startswith(var + "_")][ + "Variable" + ].to_list() + + units = [u.removeprefix(var + "_") for u in unit_options] + + rule = { + "properties": { + "attribute": {"const": var}, + "attribute_unit": {"enum": units}, + "value_num": {"type": "number"}, + }, + "required": ["value_num", "attribute_unit"], + } + + rules.append(rule) + + all_types = [t for t in all_types if t not in types] + return rules, all_types + + def numeric_attrs(arc, types: list[str], all_types: list[str]): rules = [] arc_long_numeric = arc[arc["Type"].isin(types)] @@ -143,9 +169,9 @@ def generate_long_schema(version): template_long = json.load(f) # Drop the core properties from the long schema - # Don't include descriptive, file, or NaN types (unwanted as stored attributes) + # Don't include descriptive or file types (unwanted as stored attributes) arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())] - arc_long = arc_long[~(arc_long.Type.isin(["descriptive", np.nan, "file"]))] + arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))] # Get all the response types from ARC all_types = arc_long.Type.unique().tolist() @@ -157,6 +183,8 @@ def generate_long_schema(version): 
arc_long, ["list", "user_list", "multi_list"], all_types ) + unit_rules, all_types = attrs_with_units(arc_long, [np.nan], all_types) + numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types) date_rules, all_types = date_attrs( @@ -169,7 +197,12 @@ def generate_long_schema(version): # Combine all rules into one list one_of_rules = ( - enum_rules + list_rules + numeric_rules + date_rules + other_str_rules + enum_rules + + list_rules + + unit_rules + + numeric_rules + + date_rules + + other_str_rules ) # check no types have been missed From 3bf260b49017ec4833008129146638ad3373c45b Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 9 Sep 2025 14:18:27 -0700 Subject: [PATCH 09/13] Pass ARC with remaining unmapped rows to each function Instead of a list of different input types --- schemas/isaric_schema.py | 80 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index f33f923..82de248 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -20,9 +20,10 @@ def get_enums(options): ] -def attrs_with_enums(arc, types: list[str], all_types: list[str]): +def attrs_with_enums(arc, types: list[str]): rules = [] - arc_long_with_enums = arc[arc["Type"].isin(types)] + arc_filter = arc["Type"].isin(types) + arc_long_with_enums = arc[arc_filter] for options, group in arc_long_with_enums.groupby("Answer Options"): if len(group) == 1: @@ -41,14 +42,14 @@ def attrs_with_enums(arc, types: list[str], all_types: list[str]): rule["properties"]["value"]["enum"] = enums rules.append(rule) - # drop from the list of all types - all_types = [t for t in all_types if t not in types] - return rules, all_types + return rules, arc[~arc_filter] -def attrs_with_lists(arc, types: list[str], all_types: list[str]): + +def attrs_with_lists(arc, types: list[str]): rules = [] - arc_long_lists = arc[arc["Type"].isin(types)] + arc_filter = arc["Type"].isin(types) 
+ arc_long_lists = arc[arc_filter] for list_file, group in arc_long_lists.groupby("List"): if len(group) == 1: @@ -68,18 +69,20 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]): rule["properties"]["value"] = {"type": "string", "enum": list_enums} rules.append(rule) - all_types = [t for t in all_types if t not in types] - return rules, all_types + return rules, arc[~arc_filter] -def attrs_with_units(arc, types: list[str], all_types: list[str]): +def attrs_with_units(arc, types: list[str]): rules = [] - vars_with_units = arc[arc["Type"].isin(types)]["Variable"] + arc_filter = arc["Type"].isin(types) + vars_with_units = arc[arc_filter]["Variable"] + arc_vars_to_remove = vars_with_units.copy().to_list() for var in vars_with_units: unit_options = arc[arc["Variable"].str.startswith(var + "_")][ "Variable" ].to_list() + arc_vars_to_remove += unit_options units = [u.removeprefix(var + "_") for u in unit_options] @@ -94,13 +97,13 @@ def attrs_with_units(arc, types: list[str], all_types: list[str]): rules.append(rule) - all_types = [t for t in all_types if t not in types] - return rules, all_types + return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)] -def numeric_attrs(arc, types: list[str], all_types: list[str]): +def numeric_attrs(arc, types: list[str]): rules = [] - arc_long_numeric = arc[arc["Type"].isin(types)] + arc_filter = arc["Type"].isin(types) + arc_long_numeric = arc[arc_filter] for min_max, group in arc_long_numeric.groupby( ["Minimum", "Maximum"], dropna=False @@ -121,13 +124,13 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]): rule["properties"]["value_num"]["maximum"] = float(max) rules.append(rule) - all_types = [t for t in all_types if t not in types] - return rules, all_types + return rules, arc[~arc_filter] -def date_attrs(arc, types: list[str], all_types: list[str]): +def date_attrs(arc, types: list[str]): rules = [] - arc_long_dates = arc[arc["Type"].isin(types)] + arc_filter = arc["Type"].isin(types) + 
arc_long_dates = arc[arc_filter] for input_type, group in arc_long_dates.groupby("Type"): if len(group) == 1: @@ -144,19 +147,18 @@ def date_attrs(arc, types: list[str], all_types: list[str]): rule["properties"]["value"] = {"type": "string", "format": "date-time"} rules.append(rule) - all_types = [t for t in all_types if t not in types] - return rules, all_types + return rules, arc[~arc_filter] -def generic_str_attrs(arc, types: list[str], all_types: list[str]): - arc_long_other_str = arc[arc["Type"].isin(types)] +def generic_str_attrs(arc, types: list[str]): + arc_filter = arc["Type"].isin(types) + arc_long_other_str = arc[arc_filter] rule = {"properties": {"attribute": {"enum": arc_long_other_str.Variable.tolist()}}} rule["properties"]["value"] = {"type": "string"} rule["required"] = ["value"] - all_types = [t for t in all_types if t not in types] - return [rule], all_types + return [rule], arc[~arc_filter] def generate_long_schema(version): @@ -173,26 +175,21 @@ def generate_long_schema(version): arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())] arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))] - # Get all the response types from ARC - all_types = arc_long.Type.unique().tolist() - # Generate rules for each type of attribute - enum_rules, all_types = attrs_with_enums(arc_long, ["radio", "checkbox"], all_types) + enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"]) - list_rules, all_types = attrs_with_lists( - arc_long, ["list", "user_list", "multi_list"], all_types + list_rules, arc_no_lists = attrs_with_lists( + arc_no_enums, ["list", "user_list", "multi_list"] ) - unit_rules, all_types = attrs_with_units(arc_long, [np.nan], all_types) + unit_rules, arc_no_units = attrs_with_units(arc_no_lists, [np.nan]) - numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types) + numeric_rules, arc_no_numbers = numeric_attrs(arc_no_units, ["number", "calc"]) - date_rules, all_types = 
date_attrs( - arc_long, ["date_dmy", "datetime_dmy"], all_types - ) + date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"]) - other_str_rules, all_types = generic_str_attrs( - arc_long, ["text", "notes"], all_types + other_str_rules, arc_no_other_str = generic_str_attrs( + arc_no_dates, ["text", "notes"] ) # Combine all rules into one list @@ -206,10 +203,11 @@ def generate_long_schema(version): ) # check no types have been missed - if len(all_types) > 0: + if len(arc_no_other_str) > 0: raise ValueError( - f"The following types were not processed: {', '.join(all_types)}. " - "Please check the ARC.csv file for any new types." + "The following rows were not processed: \n", + arc_no_other_str, + "Please check the ARC.csv file for any new types.", ) template_long["oneOf"] = one_of_rules From a856838c227dca6572673e76ee7b4779ab6ee6a5 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 Oct 2025 15:51:34 +0100 Subject: [PATCH 10/13] Update long ISARIC schema --- schemas/template-isaric-long.json | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json index a23141e..6210e5b 100644 --- a/schemas/template-isaric-long.json +++ b/schemas/template-isaric-long.json @@ -12,6 +12,13 @@ "type": "string", "description": "Study ID that patient belongs to." }, + "event_id": { + "type": [ + "string", + "null" + ], + "description": "ID code to link different attributes which relate to the same event (e.g. medication being administered) together." + }, "phase": { "type": "string", "enum": [ @@ -48,13 +55,6 @@ ], "description": "String data associated with the attribute." }, - "value_bool": { - "type": [ - "boolean", - "null" - ], - "description": "Use for Yes/No attributes." - }, "value_num": { "type": [ "number", @@ -69,13 +69,6 @@ ], "description": "Relative day since admission." 
}, - "reldate_adm_end": { - "type": [ - "integer", - "null" - ], - "description": "Relative day since admission, end of period starting from reldate_adm. Should be Null if event spans <= 1 day." - }, "date": { "type": [ "string", @@ -111,11 +104,6 @@ "value" ] }, - { - "required": [ - "value_bool" - ] - }, { "required": [ "value_num" From 335148fffacb7e299844f01d39c70c3006a2e23f Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 Oct 2025 15:53:26 +0100 Subject: [PATCH 11/13] Update .gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b694934..1055d1b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ -.venv \ No newline at end of file +.venv +.vscode/ +.DS_Store + +__pycache__/ +*.pyc \ No newline at end of file From 0965b5e849e834a542637e383bbc30f039636704 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 Oct 2025 16:40:20 +0100 Subject: [PATCH 12/13] Put units back in names to match arc --- schemas/isaric_schema.py | 58 ++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py index 82de248..b840c27 100644 --- a/schemas/isaric_schema.py +++ b/schemas/isaric_schema.py @@ -72,32 +72,33 @@ def attrs_with_lists(arc, types: list[str]): return rules, arc[~arc_filter] -def attrs_with_units(arc, types: list[str]): - rules = [] - arc_filter = arc["Type"].isin(types) - vars_with_units = arc[arc_filter]["Variable"] - arc_vars_to_remove = vars_with_units.copy().to_list() +# Currently not used +# def attrs_with_units(arc, types: list[str]): +# rules = [] +# arc_filter = arc["Type"].isin(types) +# vars_with_units = arc[arc_filter]["Variable"] +# arc_vars_to_remove = vars_with_units.copy().to_list() - for var in vars_with_units: - unit_options = arc[arc["Variable"].str.startswith(var + "_")][ - "Variable" - ].to_list() - arc_vars_to_remove += unit_options +# for var in vars_with_units: +# 
unit_options = arc[arc["Variable"].str.startswith(var + "_")][ +# "Variable" +# ].to_list() +# arc_vars_to_remove += unit_options - units = [u.removeprefix(var + "_") for u in unit_options] +# units = [u.removeprefix(var + "_") for u in unit_options] - rule = { - "properties": { - "attribute": {"const": var}, - "attribute_unit": {"enum": units}, - "value_num": {"type": "number"}, - }, - "required": ["value_num", "attribute_unit"], - } +# rule = { +# "properties": { +# "attribute": {"const": var}, +# "attribute_unit": {"enum": units}, +# "value_num": {"type": "number"}, +# }, +# "required": ["value_num", "attribute_unit"], +# } - rules.append(rule) +# rules.append(rule) - return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)] +# return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)] def numeric_attrs(arc, types: list[str]): @@ -171,9 +172,9 @@ def generate_long_schema(version): template_long = json.load(f) # Drop the core properties from the long schema - # Don't include descriptive or file types (unwanted as stored attributes) + # Don't include descriptive, file types or NaN's (unwanted as stored attributes) arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())] - arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))] + arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file", np.nan]))] # Generate rules for each type of attribute enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"]) @@ -182,9 +183,7 @@ def generate_long_schema(version): arc_no_enums, ["list", "user_list", "multi_list"] ) - unit_rules, arc_no_units = attrs_with_units(arc_no_lists, [np.nan]) - - numeric_rules, arc_no_numbers = numeric_attrs(arc_no_units, ["number", "calc"]) + numeric_rules, arc_no_numbers = numeric_attrs(arc_no_lists, ["number", "calc"]) date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"]) @@ -194,12 +193,7 @@ def generate_long_schema(version): # Combine all rules into one list 
one_of_rules = ( - enum_rules - + list_rules - + unit_rules - + numeric_rules - + date_rules - + other_str_rules + enum_rules + list_rules + numeric_rules + date_rules + other_str_rules ) # check no types have been missed From afef71b3b90065a4494de571ea90bca7f48ec3a6 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 23 Oct 2025 10:13:38 +0100 Subject: [PATCH 13/13] Edit to add V1.1.1 back into changelog --- CHANGELOG.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 201fbfa..cd1d2bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,16 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa - Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification. - Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies. +## ARC v1.1.1 (02 Jul 2025) + +### Overview +In ARC v1.1.1, variables in the ONSET & PRESENTATION section were updated to use the prefix pres_ instead of date_ to better reflect their meaning. + +### Variable‑Level Updates +Prefix change in ONSET & PRESENTATION section: Variables previously using the date_ prefix were renamed to pres_ (e.g., date_adm → pres_adm). + ### Interoperability -Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**. +Standardised term code lists have been revised to align with SNOMED‑CT, LOINC, and UMLS. ## ARC v1.1.0 (09 May 2025)