diff --git a/.github/workflows/get_schema_on_tag.yml b/.github/workflows/get_schema_on_tag.yml
new file mode 100644
index 0000000..52adc0c
--- /dev/null
+++ b/.github/workflows/get_schema_on_tag.yml
@@ -0,0 +1,42 @@
+name: Run Python Script on Tag
+
+# Regenerates the ISARIC long schema whenever a tag is pushed and commits the
+# generated file back to the repository's default branch.
+on:
+  push:
+    tags:
+      - '*'  # Trigger on all tags
+
+permissions:
+  contents: write  # required so the job can push the generated schema
+
+jobs:
+  run-script:
+    runs-on: ubuntu-latest
+    env:
+      # `env:` values are not passed through a shell, so parameter expansion
+      # such as ${GITHUB_REF#refs/tags/} would stay a literal string here.
+      # Use workflow expressions, which GitHub expands before the step runs.
+      TAG_NAME: ${{ github.ref_name }}
+      DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r schemas/requirements.txt
+
+      - name: Run script with tag
+        run: |
+          echo "Running script with tag: $TAG_NAME"
+          python schemas/isaric_schema.py "$TAG_NAME"
+
+      - name: Commit and push changes
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          # Braces are required: "$TAG_NAME_isaric_..." would be read as a
+          # single (unset) variable. The path is the one isaric_schema.py writes.
+          git add "schemas/arc_${TAG_NAME}_isaric_long.schema.json"
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "Add generated schema for tag $TAG_NAME"
+            # A tag checkout leaves HEAD detached, so a bare `git push` has no
+            # branch to push; name the target explicitly. This is rejected
+            # (non-fast-forward) if the tag is not at the tip of that branch.
+            git push origin "HEAD:refs/heads/${DEFAULT_BRANCH}"
+          fi
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1055d1b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.venv
+.vscode/
+.DS_Store
+
+__pycache__/
+*.pyc
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d09127..cd1d2bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,3 +12,50 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa
- Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification.
- Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies.
+## ARC v1.1.1 (02 Jul 2025)
+
+### Overview
+In ARC v1.1.1, variables in the ONSET & PRESENTATION section were updated to use the prefix `pres_` instead of `date_` to better reflect their meaning.
+
+### Variable‑Level Updates
+Prefix change in ONSET & PRESENTATION section: Variables previously using the `date_` prefix were renamed to `pres_` (e.g., `date_adm` → `pres_adm`).
+
+### Interoperability
+Standardised term code lists have been revised to align with SNOMED‑CT, LOINC, and UMLS.
+
+## ARC v1.1.0 (09 May 2025)
+
+### Overview
+ARC v1.1.0 delivers a substantial expansion of the data model, introduces a dedicated **Acute Respiratory Infection (ARI)** preset, and separates **signs** from **symptoms** to improve semantic clarity and analytic power.
+
+### Column / Preset Changes
+| Category | Details |
+|---|---|
+| **New presets / columns (1)** | `preset_ARChetype Syndromic CRF_ARI` **(new)** |
+| **Renamed presets (4)** | `preset_ARChetype CRF_Covid` → `preset_ARChetype Disease CRF_Covid`<br>`preset_ARChetype CRF_Dengue` → `preset_ARChetype Disease CRF_Dengue`<br>`preset_ARChetype CRF_Mpox` → `preset_ARChetype Disease CRF_Mpox`<br>`preset_ARChetype CRF_H5Nx` → `preset_ARChetype Disease CRF_H5Nx` |
+
+### Variable‑Level Updates
+- **Added variables**: 472 (e.g., `adsym_blurryvis`, `adsym_cough_type`).
+- **Removed variables**: 303 (e.g., `adasses_bacsi_oth`, `adasses_lymph`).
+- **Field‑type changes**: 84 variables.
+- **List updates**: 652 variables.
+- **Answer‑choice updates**: 68 variables.
+- **Signs vs Symptoms**: Clinical **signs** are now represented by brand‑new `sign_*` variables and no longer stored in `sympt_*`, clearly separating objective observations from patient‑reported symptoms.
+
+### Interoperability
+Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**.
+
+## ARC v1.0.4 (04 Mar 2025)
+
+### Overview
+The changes in ARC v1.0.4 are designed to enhance the system’s usability and ensure a more coherent structure for data entry and analysis. Users are encouraged to review their workflows and adjust any scripts or processes to reflect the updated variable names and group structures.
+
+### Key Updates
+
+**Renaming of Presets**
+ - The preset "disease" has been renamed to "ARChetype CRF" in the principal ARC CSV and in the lists.
+
+**Updates to Answer Choices**
+ - The test_biospecimentype variable has been changed to a userlist.
+
+---
diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json
new file mode 100644
index 0000000..5d0056e
--- /dev/null
+++ b/schemas/isaric-core.json
@@ -0,0 +1,113 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema",
+ "title": "ISARIC Core Wide Schema",
+ "type": "object",
+ "properties": {
+ "subjid": {
+ "type": "string",
+ "description": "Patient Identification Number (PIN). May not uniquely identify a patient."
+ },
+ "siteid": {
+ "type": "string",
+ "description": "Site ID that collected the data for the patient."
+ },
+ "dataset_id": {
+ "type": "string",
+ "description": "Dataset ID that patient belongs to."
+ },
+ "dataset_disease": {
+ "type": "string",
+ "description": "Disease or syndrome corresponding to the primary reason for the data collection (same for each patient in the dataset)."
+ },
+ "demog_sex": {
+ "type": "string",
+ "enum": [
+ "male",
+ "female",
+ "other"
+ ],
+ "description": "Sex at birth"
+ },
+ "demog_age_days": {
+ "type": "integer",
+ "minimum": 0,
+ "description": "Age in days"
+ },
+ "demog_country_iso3": {
+ "type": "string",
+ "pattern": "^[A-Z]{3}$",
+ "description": "ISO 3166-1 alpha-3 country code"
+ },
+ "pres_adm": {
+ "type": "string",
+ "description": "Admitted to hospital",
+ "enum": [
+ "unknown",
+ "yes",
+ "no"
+ ]
+ },
+ "pres_date": {
+ "type": "string",
+ "anyOf": [
+ {
+ "format": "date-time"
+ },
+ {
+ "format": "date"
+ },
+ {
+ "pattern": "^[0-9]{4}-[0-9]{2}$"
+ },
+ {
+ "pattern": "^[0-9]{4}$"
+ }
+ ],
+ "description": "Most recent presentation/admission date at this facility"
+ },
+ "outco_outcome": {
+ "type": "string",
+ "enum": [
+ "discharged_alive",
+ "still_hospitalised",
+ "transfer_to_other_facility",
+ "death",
+ "palliative_care",
+ "discharged_against_medical_advice",
+ "alive_not_admitted"
+ ],
+ "description": "Outcome"
+ },
+ "outco_date": {
+ "type": "string",
+ "anyOf": [
+ {
+ "format": "date-time"
+ },
+ {
+ "format": "date"
+ },
+ {
+ "pattern": "^[0-9]{4}-[0-9]{2}$"
+ },
+ {
+ "pattern": "^[0-9]{4}$"
+ }
+ ],
+ "description": "Outcome date"
+ }
+ },
+ "required": [
+ "subjid",
+ "siteid",
+ "dataset_id",
+ "dataset_disease",
+ "demog_sex",
+ "demog_age_days",
+ "demog_country_iso3",
+ "pres_adm",
+ "pres_date",
+ "outco_outcome",
+ "outco_date"
+ ]
+}
\ No newline at end of file
diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
new file mode 100644
index 0000000..b840c27
--- /dev/null
+++ b/schemas/isaric_schema.py
@@ -0,0 +1,221 @@
+"""
+Auto-generates a long schema matching the ISARIC format with the latest ARC variables.
+
+To be run via a github-action when the ARC version is updated.
+"""
+
+import pandas as pd
+import json
+import numpy as np
+from pathlib import Path
+import sys
+
+
+def get_enums(options):
+ """Extracts the enum values from the 'Answer Options' field."""
+ if pd.isna(options):
+ return []
+ return [
+ ",".join(c.split(",")[1:]).lstrip(" ").rstrip(" ") for c in options.split("|")
+ ]
+
+
+def attrs_with_enums(arc, types: list[str]):
+ rules = []
+ arc_filter = arc["Type"].isin(types)
+ arc_long_with_enums = arc[arc_filter]
+
+ for options, group in arc_long_with_enums.groupby("Answer Options"):
+ if len(group) == 1:
+ name = {"const": group.Variable.iloc[0]}
+ else:
+ name = {"enum": group.Variable.tolist()}
+ rule = {
+ "properties": {"attribute": name},
+ "required": ["value"],
+ }
+ rule["properties"]["value"] = {"type": "string"}
+ enums = get_enums(options)
+ if set(enums) == {"Yes", "No"}:
+ rule["properties"]["value_bool"] = {"type": "boolean"}
+ else:
+ rule["properties"]["value"]["enum"] = enums
+
+ rules.append(rule)
+
+ return rules, arc[~arc_filter]
+
+
+def attrs_with_lists(arc, types: list[str]):
+ rules = []
+ arc_filter = arc["Type"].isin(types)
+ arc_long_lists = arc[arc_filter]
+
+ for list_file, group in arc_long_lists.groupby("List"):
+ if len(group) == 1:
+ name = {"const": group.Variable.iloc[0]}
+ else:
+ name = {"enum": group.Variable.tolist()}
+ rule = {
+ "properties": {"attribute": name},
+ "required": ["value"],
+ }
+ file_name = (list_file + ".csv").split("_")
+ path = Path(*["Lists"] + file_name)
+ if not path.exists():
+ raise FileNotFoundError(f"List file {list_file} does not exist.")
+ list_enums = [x.strip() for x in pd.read_csv(path).iloc[:, 0].unique().tolist()]
+
+ rule["properties"]["value"] = {"type": "string", "enum": list_enums}
+
+ rules.append(rule)
+ return rules, arc[~arc_filter]
+
+
+# Currently not used
+# def attrs_with_units(arc, types: list[str]):
+# rules = []
+# arc_filter = arc["Type"].isin(types)
+# vars_with_units = arc[arc_filter]["Variable"]
+# arc_vars_to_remove = vars_with_units.copy().to_list()
+
+# for var in vars_with_units:
+# unit_options = arc[arc["Variable"].str.startswith(var + "_")][
+# "Variable"
+# ].to_list()
+# arc_vars_to_remove += unit_options
+
+# units = [u.removeprefix(var + "_") for u in unit_options]
+
+# rule = {
+# "properties": {
+# "attribute": {"const": var},
+# "attribute_unit": {"enum": units},
+# "value_num": {"type": "number"},
+# },
+# "required": ["value_num", "attribute_unit"],
+# }
+
+# rules.append(rule)
+
+# return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)]
+
+
+def numeric_attrs(arc, types: list[str]):
+ rules = []
+ arc_filter = arc["Type"].isin(types)
+ arc_long_numeric = arc[arc_filter]
+
+ for min_max, group in arc_long_numeric.groupby(
+ ["Minimum", "Maximum"], dropna=False
+ ):
+ min, max = min_max
+ if len(group) == 1:
+ name = {"const": group.Variable.iloc[0]}
+ else:
+ name = {"enum": group.Variable.tolist()}
+ rule = {
+ "properties": {"attribute": name},
+ "required": ["value_num"],
+ }
+ rule["properties"]["value_num"] = {"type": "number"}
+ if not pd.isna(min):
+ rule["properties"]["value_num"]["minimum"] = float(min)
+ if not pd.isna(max):
+ rule["properties"]["value_num"]["maximum"] = float(max)
+
+ rules.append(rule)
+ return rules, arc[~arc_filter]
+
+
+def date_attrs(arc, types: list[str]):
+ rules = []
+ arc_filter = arc["Type"].isin(types)
+ arc_long_dates = arc[arc_filter]
+
+ for input_type, group in arc_long_dates.groupby("Type"):
+ if len(group) == 1:
+ name = {"const": group.Variable.iloc[0]}
+ else:
+ name = {"enum": group.Variable.tolist()}
+ rule = {
+ "properties": {"attribute": name},
+ "required": ["value"],
+ }
+ if input_type == "date_dmy":
+ rule["properties"]["value"] = {"type": "string", "format": "date"}
+ elif input_type == "datetime_dmy":
+ rule["properties"]["value"] = {"type": "string", "format": "date-time"}
+
+ rules.append(rule)
+ return rules, arc[~arc_filter]
+
+
+def generic_str_attrs(arc, types: list[str]):
+ arc_filter = arc["Type"].isin(types)
+ arc_long_other_str = arc[arc_filter]
+
+ rule = {"properties": {"attribute": {"enum": arc_long_other_str.Variable.tolist()}}}
+ rule["properties"]["value"] = {"type": "string"}
+ rule["required"] = ["value"]
+
+ return [rule], arc[~arc_filter]
+
+
+def generate_long_schema(version):
+ arc = pd.read_csv("ARC.csv")
+
+ with open("schemas/isaric-core.json", "r") as f:
+ template_core = json.load(f)
+
+ with open("schemas/template-isaric-long.json", "r") as f:
+ template_long = json.load(f)
+
+ # Drop the core properties from the long schema
+ # Don't include descriptive, file types or NaN's (unwanted as stored attributes)
+ arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
+ arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file", np.nan]))]
+
+ # Generate rules for each type of attribute
+ enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"])
+
+ list_rules, arc_no_lists = attrs_with_lists(
+ arc_no_enums, ["list", "user_list", "multi_list"]
+ )
+
+ numeric_rules, arc_no_numbers = numeric_attrs(arc_no_lists, ["number", "calc"])
+
+ date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"])
+
+ other_str_rules, arc_no_other_str = generic_str_attrs(
+ arc_no_dates, ["text", "notes"]
+ )
+
+ # Combine all rules into one list
+ one_of_rules = (
+ enum_rules + list_rules + numeric_rules + date_rules + other_str_rules
+ )
+
+ # check no types have been missed
+ if len(arc_no_other_str) > 0:
+ raise ValueError(
+ "The following rows were not processed: \n",
+ arc_no_other_str,
+ "Please check the ARC.csv file for any new types.",
+ )
+
+ template_long["oneOf"] = one_of_rules
+
+ # Generate new long schema
+ with open(f"schemas/arc_{version}_isaric_long.schema.json", "w") as f:
+ json.dump(template_long, f, indent=4)
+
+
+def main():
+ tag = sys.argv[1]
+ print(f"Running script with tag: {tag}")
+ generate_long_schema(tag)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/schemas/requirements.txt b/schemas/requirements.txt
new file mode 100644
index 0000000..6d7fa5e
--- /dev/null
+++ b/schemas/requirements.txt
@@ -0,0 +1,2 @@
+numpy==2.3.1
+pandas==2.3.1
diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json
new file mode 100644
index 0000000..6210e5b
--- /dev/null
+++ b/schemas/template-isaric-long.json
@@ -0,0 +1,119 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "ISARIC Long Schema",
+ "additionalProperties": false,
+ "type": "object",
+ "properties": {
+ "subjid": {
+ "type": "string",
+ "description": "Patient Identification Number (PIN). May not uniquely identify a patient."
+ },
+ "dataset_id": {
+ "type": "string",
+ "description": "Study ID that patient belongs to."
+ },
+ "event_id": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "description": "ID code to link different attributes which relate to the same event (e.g. medication being administered) together."
+ },
+ "phase": {
+ "type": "string",
+ "enum": [
+ "presentation",
+ "pre_observation",
+ "during_observation",
+ "followup",
+ "outcome"
+ ],
+ "description": "Healthcare encounter phase"
+ },
+ "attribute": {
+ "type": "string",
+ "description": "Name of the attribute/event being recorded (e.g. temperature, blood pressure, etc.). Where an attribute with the same or substantially similar semantics exist in ARC, that attribute name must be used."
+ },
+ "attribute_unit": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "description": "Unit of the attribute being recorded (e.g. 'mg/dL'). Null if the attribute has no unit."
+ },
+ "arcver": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "description": "ARC version used to generate the CRF. Null if not generated from ARC."
+ },
+ "value": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "description": "String data associated with the attribute."
+ },
+ "value_num": {
+ "type": [
+ "number",
+ "null"
+ ],
+ "description": "Value of numerical measurements, e.g. temperature, BP."
+ },
+ "reldate_adm": {
+ "type": [
+ "integer",
+ "null"
+ ],
+ "description": "Relative day since admission."
+ },
+ "date": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "anyOf": [
+ {
+ "format": "date-time"
+ },
+ {
+ "format": "date"
+ },
+ {
+ "pattern": "^[0-9]{4}-[0-9]{2}$"
+ },
+ {
+ "pattern": "^[0-9]{4}$"
+ }
+ ],
+ "description": "Date of event."
+ },
+ "duration": {
+ "type": [
+ "integer",
+ "null"
+ ],
+ "description": "Duration of the event in days."
+ }
+ },
+ "oneOf": [
+ {
+ "required": [
+ "value"
+ ]
+ },
+ {
+ "required": [
+ "value_num"
+ ]
+ }
+ ],
+ "required": [
+ "subjid",
+ "dataset_id",
+ "phase",
+ "attribute"
+ ]
+}
\ No newline at end of file