From 470adaf170074053bb21f293eb8c7a7f500691ab Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Mon, 21 Jul 2025 14:44:53 +0100
Subject: [PATCH 01/13] Adds a workflow to auto-generate an isaric-long schema
 populated with the current ARC variables

---
 .github/workflows/get_schema_on_tag.yml |  42 ++++++
 .gitignore                              |   1 +
 schemas/isaric-core.json                |  82 +++++++++++
 schemas/isaric_schema.py                | 180 ++++++++++++++++++++++++
 schemas/requirements.txt                |   2 +
 schemas/template-isaric-long.json       | 122 ++++++++++++++++
 6 files changed, 429 insertions(+)
 create mode 100644 .github/workflows/get_schema_on_tag.yml
 create mode 100644 .gitignore
 create mode 100644 schemas/isaric-core.json
 create mode 100644 schemas/isaric_schema.py
 create mode 100644 schemas/requirements.txt
 create mode 100644 schemas/template-isaric-long.json

diff --git a/.github/workflows/get_schema_on_tag.yml b/.github/workflows/get_schema_on_tag.yml
new file mode 100644
index 0000000..52adc0c
--- /dev/null
+++ b/.github/workflows/get_schema_on_tag.yml
@@ -0,0 +1,42 @@
+name: Run Python Script on Tag
+
+on:
+  push:
+    tags:
+      - '*'  # Trigger on all tags
+
+permissions:
+  contents: write
+
+jobs:
+  run-script:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r schemas/requirements.txt
+
+      - name: Run script with tag
+        run: |
+          TAG_NAME=${GITHUB_REF#refs/tags/}
+          echo "Running script with tag: $TAG_NAME"
+          python schemas/isaric_schema.py "$TAG_NAME"
+
+      - name: Commit and push changes
+        run: |
+            git config --global user.name "github-actions[bot]"
+            git config --global user.email "github-actions[bot]@users.noreply.github.com"
+            git add "schemas/arc_${TAG_NAME}_isaric_long"*schema.json  # braces stop the shell reading $TAG_NAME_isaric_...
+            git commit -m "Add generated schema for tag $TAG_NAME" || echo "No changes to commit"
+            git push origin "HEAD:${{ github.event.repository.default_branch }}"  # tag checkout is a detached HEAD
+        env:
+            TAG_NAME: ${{ github.ref_name }}  # env values are not shell-expanded; use an Actions expression
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b694934
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv
\ No newline at end of file
diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json
new file mode 100644
index 0000000..78e4b92
--- /dev/null
+++ b/schemas/isaric-core.json
@@ -0,0 +1,82 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "ISARIC Core Wide Schema",
+    "type": "object",
+    "properties": {
+        "subjid": {
+            "type": "string",
+            "description": "Patient Identification Number (PIN). May not uniquely identify a patient."
+        },
+        "siteid": {
+            "type": "string",
+            "description": "Site ID that collected the data for the patient."
+        },
+        "dataset_id": {
+            "type": "string",
+            "description": "Dataset ID that patient belongs to."
+        },
+        "dataset_disease": {
+            "type": "string",
+            "description": "Disease or syndrome corresponding to the primary reason for the data collection (same for each patient in the dataset)."
+        },
+        "demog_sex": {
+            "type": "string",
+            "enum": [
+                "male",
+                "female",
+                "other"
+            ],
+            "description": "Sex at birth"
+        },
+        "demog_age_days": {
+            "type": "integer",
+            "minimum": 0,
+            "description": "Age in days"
+        },
+        "demog_country_iso3": {
+            "type": "string",
+            "pattern": "^[A-Z]{3}$",
+            "description": "ISO 3166-1 alpha-3 country code"
+        },
+        "pres_adm": {
+            "type": "boolean",
+            "description": "Admitted to hospital"
+        },
+        "pres_date": {
+            "type": "string",
+            "format": "datetime",
+            "description": "Most recent presentation/admission date at this facility"
+        },
+        "outco_outcome": {
+            "type": "string",
+            "enum": [
+                "discharged_alive",
+                "still_hospitalised",
+                "transfer_to_other_facility",
+                "death",
+                "palliative_care",
+                "discharged_against_medical_advice",
+                "alive_not_admitted"
+            ],
+            "description": "Outcome"
+        },
+        "outco_date": {
+            "type": "string",
+            "format": "datetime",
+            "description": "Outcome date"
+        }
+    },
+    "required": [
+        "subjid",
+        "siteid",
+        "dataset_id",
+        "dataset_disease",
+        "demog_sex",
+        "demog_age_days",
+        "demog_country_iso3",
+        "pres_adm",
+        "pres_date",
+        "outco_outcome",
+        "outco_date"
+    ]
+}
\ No newline at end of file
diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
new file mode 100644
index 0000000..9e3f371
--- /dev/null
+++ b/schemas/isaric_schema.py
@@ -0,0 +1,180 @@
+"""
+Auto-generates a long schema matching the ISARIC format with the latest ARC variables.
+
+To be run via a github-action when the ARC version is updated.
+"""
+
+import pandas as pd
+import json
+import numpy as np
+from pathlib import Path
+import sys
+
+
+def get_enums(options):
+    """Extracts the enum values from the 'Answer Options' field."""
+    if pd.isna(options):
+        return []
+    return [
+        ",".join(c.split(",")[1:]).lstrip(" ").rstrip(" ") for c in options.split("|")
+    ]
+
+
+def attrs_with_enums(arc, types: list[str], all_types: list[str]):
+    rules = []
+    arc_long_with_enums = arc[arc["Type"].isin(types)]
+
+    for options, group in arc_long_with_enums.groupby("Answer Options"):
+        rule = {
+            "properties": {"attribute": {"enum": group.Variable.tolist()}},
+            "required": ["value"],
+        }
+        rule["properties"]["value"] = {"type": "string"}
+        enums = get_enums(options)
+        if set(enums) == {"Yes", "No"}:
+            rule["properties"]["value_bool"] = {"type": "boolean"}
+        else:
+            rule["properties"]["value"]["enum"] = enums
+
+        rules.append(rule)
+    # drop from the list of all types
+    all_types = [t for t in all_types if t not in types]
+    return rules, all_types
+
+
+def attrs_with_lists(arc, types: list[str], all_types: list[str]):
+    rules = []
+    arc_long_lists = arc[arc["Type"].isin(types)]
+
+    for list_file, group in arc_long_lists.groupby("List"):
+        rule = {
+            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "required": ["value"],
+        }
+        file_name = (list_file + ".csv").split("_")
+        path = Path(*["Lists"] + file_name)
+        if not path.exists():
+            raise FileNotFoundError(f"List file {list_file} does not exist.")
+        list_enums = pd.read_csv(path).iloc[:, 0].unique().tolist()
+
+        rule["properties"]["value"] = {"type": "string", "enum": list_enums}
+
+        rules.append(rule)
+    all_types = [t for t in all_types if t not in types]
+    return rules, all_types
+
+
+def numeric_attrs(arc, types: list[str], all_types: list[str]):
+    rules = []
+    arc_long_numeric = arc[arc["Type"].isin(types)]
+
+    for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]):
+        min, max = min_max
+        rule = {
+            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "required": ["value_num", "attribute_unit"],
+        }
+        rule["properties"]["value_num"] = {
+            "type": "number",
+            "minimum": float(min),
+            "maximum": float(max),
+        }
+
+        rules.append(rule)
+    all_types = [t for t in all_types if t not in types]
+    return rules, all_types
+
+
+def date_attrs(arc, types: list[str], all_types: list[str]):
+    rules = []
+    arc_long_dates = arc[arc["Type"].isin(types)]
+
+    for input_type, group in arc_long_dates.groupby("Type"):
+        rule = {
+            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "required": ["value"],
+        }
+        if input_type == "date_dmy":
+            rule["properties"]["value"] = {"type": "string", "format": "date"}
+        elif input_type == "datetime_dmy":
+            rule["properties"]["value"] = {"type": "string", "format": "date-time"}
+
+        rules.append(rule)
+    all_types = [t for t in all_types if t not in types]
+    return rules, all_types
+
+
+def generic_str_attrs(arc, types: list[str], all_types: list[str]):
+    arc_long_other_str = arc[arc["Type"].isin(types)]
+
+    rule = {
+        "properties": {"attribute": {"const": arc_long_other_str.Variable.tolist()}}
+    }
+    rule["properties"]["value"] = {"type": "string"}
+    rule["required"] = ["value"]
+
+    all_types = [t for t in all_types if t not in types]
+    return [rule], all_types
+
+
+def generate_long_schema(version):
+    arc = pd.read_csv("ARC.csv")
+
+    with open("schemas/isaric-core.json", "r") as f:
+        template_core = json.load(f)
+
+    with open("schemas/template-isaric-long.json", "r") as f:
+        template_long = json.load(f)
+
+    # Drop the core properties from the long schema
+    # Don't include descriptive, file, or NaN types (unwanted as stored attributes)
+    arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
+    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", np.nan, "file"]))]
+
+    # Get all the response types from ARC
+    all_types = arc_long.Type.unique().tolist()
+
+    # Generate rules for each type of attribute
+    enum_rules, all_types = attrs_with_enums(arc_long, ["radio", "checkbox"], all_types)
+
+    list_rules, all_types = attrs_with_lists(
+        arc_long, ["list", "user_list", "multi_list"], all_types
+    )
+
+    numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types)
+
+    date_rules, all_types = date_attrs(
+        arc_long, ["date_dmy", "datetime_dmy"], all_types
+    )
+
+    other_str_rules, all_types = generic_str_attrs(
+        arc_long, ["text", "notes"], all_types
+    )
+
+    # Combine all rules into one list
+    one_of_rules = (
+        enum_rules + list_rules + numeric_rules + date_rules + other_str_rules
+    )
+
+    # check no types have been missed
+    if len(all_types) > 0:
+        raise ValueError(
+            f"The following types were not processed: {', '.join(all_types)}. "
+            "Please check the ARC.csv file for any new types."
+        )
+
+    template_long["oneOf"] = one_of_rules
+
+    # Generate new long schema
+    with open(f"schemas/arc_{version}_isaric_long_schema.json", "w") as f:
+        json.dump(template_long, f, indent=4)
+
+
+def main():
+    tag = sys.argv[1]
+    print(f"Running script with tag: {tag}")
+    generate_long_schema(tag)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/schemas/requirements.txt b/schemas/requirements.txt
new file mode 100644
index 0000000..6d7fa5e
--- /dev/null
+++ b/schemas/requirements.txt
@@ -0,0 +1,2 @@
+numpy==2.3.1
+pandas==2.3.1
diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json
new file mode 100644
index 0000000..f459d49
--- /dev/null
+++ b/schemas/template-isaric-long.json
@@ -0,0 +1,122 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "ISARIC Long Schema",
+    "additionalProperties": false,
+    "type": "object",
+    "properties": {
+        "subjid": {
+            "type": "string",
+            "description": "Patient Identification Number (PIN). May not uniquely identify a patient."
+        },
+        "dataset_id": {
+            "type": "string",
+            "description": "Study ID that patient belongs to."
+        },
+        "phase": {
+            "type": "string",
+            "enum": [
+                "presentation",
+                "pre_observation",
+                "during_observation",
+                "followup",
+                "outcome"
+            ],
+            "description": "Healthcare encounter phase"
+        },
+        "attribute": {
+            "type": "string",
+            "description": "Name of the attribute/event being recorded (e.g. temperature, blood pressure, etc.). Where an attribute with the same or substantially similar semantics exists in ARC, that attribute name must be used."
+        },
+        "attribute_unit": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Unit of the attribute being recorded (e.g. 'mg/dL'). Null if the attribute has no unit."
+        },
+        "arcver": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "ARC version used to generate the CRF. Null if not generated from ARC."
+        },
+        "value": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "String data associated with the attribute."
+        },
+        "value_bool": {
+            "type": [
+                "boolean",
+                "null"
+            ],
+            "description": "Use for Yes/No attributes."
+        },
+        "value_num": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "Value of numerical measurements, e.g. temperature, BP."
+        },
+        "reldate_adm": {
+            "type": [
+                "integer",
+                "null"
+            ],
+            "description": "Relative day since admission."
+        },
+        "reldate_adm_end": {
+            "type": [
+                "integer",
+                "null"
+            ],
+            "description": "Relative day since admission, end of period starting from reldate_adm. Should be Null if event spans <= 1 day."
+        },
+        "date": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "format": "datetime",
+            "description": "Date of event."
+        },
+        "duration": {
+            "type": [
+                "integer",
+                "null"
+            ],
+            "description": "Duration of the event in days."
+        }
+    },
+    "oneOf": [
+        {
+            "required": [
+                "value"
+            ]
+        },
+        {
+            "required": [
+                "value_bool"
+            ]
+        },
+        {
+            "required": [
+                "value_num"
+            ]
+        }
+    ],
+    "required": [
+        "subjid",
+        "dataset_id",
+        "phase",
+        "attribute",
+        "arcver",
+        "reldate_adm",
+        "date",
+        "duration"
+    ]
+}
\ No newline at end of file

From a1c785510b0ef9cde0ebaee19e34b1672959afab Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Mon, 21 Jul 2025 15:48:21 +0100
Subject: [PATCH 02/13] fix schema 'const' being given a list of options

---
 schemas/isaric_schema.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index 9e3f371..d66fa37 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -25,8 +25,12 @@ def attrs_with_enums(arc, types: list[str], all_types: list[str]):
     arc_long_with_enums = arc[arc["Type"].isin(types)]
 
     for options, group in arc_long_with_enums.groupby("Answer Options"):
+        if len(group) == 1:
+            name = {"const": group.Variable.iloc[0]}
+        else:
+            name = {"enum": group.Variable.tolist()}
         rule = {
-            "properties": {"attribute": {"enum": group.Variable.tolist()}},
+            "properties": {"attribute": name},
             "required": ["value"],
         }
         rule["properties"]["value"] = {"type": "string"}
@@ -47,8 +51,12 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]):
     arc_long_lists = arc[arc["Type"].isin(types)]
 
     for list_file, group in arc_long_lists.groupby("List"):
+        if len(group) == 1:
+            name = {"const": group.Variable.iloc[0]}
+        else:
+            name = {"enum": group.Variable.tolist()}
         rule = {
-            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "properties": {"attribute": name},
             "required": ["value"],
         }
         file_name = (list_file + ".csv").split("_")
@@ -70,8 +78,12 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]):
 
     for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]):
         min, max = min_max
+        if len(group) == 1:
+            name = {"const": group.Variable.iloc[0]}
+        else:
+            name = {"enum": group.Variable.tolist()}
         rule = {
-            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "properties": {"attribute": name},
             "required": ["value_num", "attribute_unit"],
         }
         rule["properties"]["value_num"] = {
@@ -90,8 +102,12 @@ def date_attrs(arc, types: list[str], all_types: list[str]):
     arc_long_dates = arc[arc["Type"].isin(types)]
 
     for input_type, group in arc_long_dates.groupby("Type"):
+        if len(group) == 1:
+            name = {"const": group.Variable.iloc[0]}
+        else:
+            name = {"enum": group.Variable.tolist()}
         rule = {
-            "properties": {"attribute": {"const": group.Variable.tolist()}},
+            "properties": {"attribute": name},
             "required": ["value"],
         }
         if input_type == "date_dmy":
@@ -107,9 +123,7 @@ def date_attrs(arc, types: list[str], all_types: list[str]):
 def generic_str_attrs(arc, types: list[str], all_types: list[str]):
     arc_long_other_str = arc[arc["Type"].isin(types)]
 
-    rule = {
-        "properties": {"attribute": {"const": arc_long_other_str.Variable.tolist()}}
-    }
+    rule = {"properties": {"attribute": {"enum": arc_long_other_str.Variable.tolist()}}}
     rule["properties"]["value"] = {"type": "string"}
     rule["required"] = ["value"]
 

From 0c21cbccc41a05dacebc568b6c5d26369f356536 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Tue, 29 Jul 2025 16:19:21 +0100
Subject: [PATCH 03/13] Stop numerical variables being dropped when min/max is
 null

---
 schemas/isaric_schema.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index d66fa37..a99bffb 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -76,7 +76,9 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]):
     rules = []
     arc_long_numeric = arc[arc["Type"].isin(types)]
 
-    for min_max, group in arc_long_numeric.groupby(["Minimum", "Maximum"]):
+    for min_max, group in arc_long_numeric.groupby(
+        ["Minimum", "Maximum"], dropna=False
+    ):
         min, max = min_max
         if len(group) == 1:
             name = {"const": group.Variable.iloc[0]}
@@ -86,11 +88,11 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]):
             "properties": {"attribute": name},
             "required": ["value_num", "attribute_unit"],
         }
-        rule["properties"]["value_num"] = {
-            "type": "number",
-            "minimum": float(min),
-            "maximum": float(max),
-        }
+        rule["properties"]["value_num"] = {"type": "number"}
+        if not pd.isna(min):
+            rule["properties"]["value_num"]["minimum"] = float(min)
+        if not pd.isna(max):
+            rule["properties"]["value_num"]["maximum"] = float(max)
 
         rules.append(rule)
     all_types = [t for t in all_types if t not in types]

From f627daab65e567e5e3db5674fed1f92631506861 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Thu, 7 Aug 2025 13:27:49 +0100
Subject: [PATCH 04/13] edit schemas

---
 schemas/isaric-core.json          | 41 +++++++++++++++++++++++++++----
 schemas/isaric_schema.py          |  2 +-
 schemas/template-isaric-long.json | 21 +++++++++++-----
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/schemas/isaric-core.json b/schemas/isaric-core.json
index 78e4b92..5d0056e 100644
--- a/schemas/isaric-core.json
+++ b/schemas/isaric-core.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema#",
+    "$schema": "http://json-schema.org/draft-07/schema",
     "title": "ISARIC Core Wide Schema",
     "type": "object",
     "properties": {
@@ -39,12 +39,30 @@
             "description": "ISO 3166-1 alpha-3 country code"
         },
         "pres_adm": {
-            "type": "boolean",
-            "description": "Admitted to hospital"
+            "type": "string",
+            "description": "Admitted to hospital",
+            "enum": [
+                "unknown",
+                "yes",
+                "no"
+            ]
         },
         "pres_date": {
             "type": "string",
-            "format": "datetime",
+            "anyOf": [
+                {
+                    "format": "date-time"
+                },
+                {
+                    "format": "date"
+                },
+                {
+                    "pattern": "^[0-9]{4}-[0-9]{2}$"
+                },
+                {
+                    "pattern": "^[0-9]{4}$"
+                }
+            ],
             "description": "Most recent presentation/admission date at this facility"
         },
         "outco_outcome": {
@@ -62,7 +80,20 @@
         },
         "outco_date": {
             "type": "string",
-            "format": "datetime",
+            "anyOf": [
+                {
+                    "format": "date-time"
+                },
+                {
+                    "format": "date"
+                },
+                {
+                    "pattern": "^[0-9]{4}-[0-9]{2}$"
+                },
+                {
+                    "pattern": "^[0-9]{4}$"
+                }
+            ],
             "description": "Outcome date"
         }
     },
diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index a99bffb..b494e54 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -86,7 +86,7 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]):
             name = {"enum": group.Variable.tolist()}
         rule = {
             "properties": {"attribute": name},
-            "required": ["value_num", "attribute_unit"],
+            "required": ["value_num"],
         }
         rule["properties"]["value_num"] = {"type": "number"}
         if not pd.isna(min):
diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json
index f459d49..a23141e 100644
--- a/schemas/template-isaric-long.json
+++ b/schemas/template-isaric-long.json
@@ -81,7 +81,20 @@
                 "string",
                 "null"
             ],
-            "format": "datetime",
+            "anyOf": [
+                {
+                    "format": "date-time"
+                },
+                {
+                    "format": "date"
+                },
+                {
+                    "pattern": "^[0-9]{4}-[0-9]{2}$"
+                },
+                {
+                    "pattern": "^[0-9]{4}$"
+                }
+            ],
             "description": "Date of event."
         },
         "duration": {
@@ -113,10 +126,6 @@
         "subjid",
         "dataset_id",
         "phase",
-        "attribute",
-        "arcver",
-        "reldate_adm",
-        "date",
-        "duration"
+        "attribute"
     ]
 }
\ No newline at end of file

From 199b8e70f17e9d239ad22df696932cf068555260 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Fri, 8 Aug 2025 14:58:14 +0100
Subject: [PATCH 05/13] strip trailing/leading spaces from variables stored in
 separate files

---
 schemas/isaric_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index b494e54..d4a6c63 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -63,7 +63,7 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]):
         path = Path(*["Lists"] + file_name)
         if not path.exists():
             raise FileNotFoundError(f"List file {list_file} does not exist.")
-        list_enums = pd.read_csv(path).iloc[:, 0].unique().tolist()
+        list_enums = pd.read_csv(path).iloc[:, 0].dropna().str.strip().unique().tolist()
 
         rule["properties"]["value"] = {"type": "string", "enum": list_enums}
 

From d6e70a9d376d4a4d82bff1806c60823b0aefb62e Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Fri, 8 Aug 2025 15:03:40 +0100
Subject: [PATCH 06/13] Change new schema name

---
 schemas/isaric_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index d4a6c63..dbdb89f 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -182,7 +182,7 @@ def generate_long_schema(version):
     template_long["oneOf"] = one_of_rules
 
     # Generate new long schema
-    with open(f"schemas/arc_{version}_isaric_long_schema.json", "w") as f:
+    with open(f"schemas/arc_{version}_isaric_long.schema.json", "w") as f:
         json.dump(template_long, f, indent=4)
 
 

From 758b213cb562c53ccfc4394ff848cfb8353c937b Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Tue, 9 Sep 2025 13:02:59 -0700
Subject: [PATCH 07/13] Add some history to changelog.md file

---
 CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d09127..201fbfa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,3 +12,42 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa
 - Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification.  
 - Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies.  
 
+### Interoperability
+Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**.
+
+## ARC v1.1.0 (09 May 2025)
+
+### Overview
+ARC v1.1.0 delivers a substantial expansion of the data model, introduces a dedicated **Acute Respiratory Infection (ARI)** preset, and separates **signs** from **symptoms** to improve semantic clarity and analytic power.
+
+### Column / Preset Changes
+| Category | Details |
+|---|---|
+| **New presets / columns (1)** | `preset_ARChetype Syndromic CRF_ARI` **(new)** |
+| **Renamed presets / columns (4)** | `preset_ARChetype CRF_Covid` → `preset_ARChetype Disease CRF_Covid`<br>`preset_ARChetype CRF_Dengue` → `preset_ARChetype Disease CRF_Dengue`<br>`preset_ARChetype CRF_Mpox` → `preset_ARChetype Disease CRF_Mpox`<br>`preset_ARChetype CRF_H5Nx` → `preset_ARChetype Disease CRF_H5Nx`|
+
+### Variable‑Level Updates
+- **Added variables**: 472 (e.g., `adsym_blurryvis`, `adsym_cough_type`).
+- **Removed variables**: 303 (e.g., `adasses_bacsi_oth`, `adasses_lymph`).
+- **Field‑type changes**: 84 variables.
+- **List updates**: 652 variables.
+- **Answer‑choice updates**: 68 variables.
+- **Signs vs Symptoms**: Clinical **signs** are now represented by brand‑new `sign_*` variables and no longer stored in `sympt_*`, clearly separating objective observations from patient‑reported symptoms.
+
+### Interoperability
+Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**.
+
+## ARC v1.0.4  (04 Mar 2025)
+
+### Overview  
+The changes in ARC v1.0.4 are designed to enhance the system’s usability and ensure a more coherent structure for data entry and analysis. Users are encouraged to review their workflows and adjust any scripts or processes to reflect the updated variable names and group structures.
+
+### Key Updates  
+
+**Renaming of Presets**  
+   - The preset "disease" has been renamed to "ARChetype CRF" in the principal ARC CSV and in the lists.
+
+**Updates to Answer Choices**  
+   - The test_biospecimentype variable has been changed to a userlist.
+
+---

From e5d0fc813923fd2f7ea4cc1785a75c5caef5b3f9 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Tue, 9 Sep 2025 13:47:48 -0700
Subject: [PATCH 08/13] Get units into schema with static variable name

---
 schemas/isaric_schema.py | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index dbdb89f..f33f923 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -72,6 +72,32 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]):
     return rules, all_types
 
 
+def attrs_with_units(arc, types: list[str], all_types: list[str]):
+    rules = []
+    vars_with_units = arc[arc["Type"].isin(types)]["Variable"]
+
+    for var in vars_with_units:
+        unit_options = arc[arc["Variable"].str.startswith(var + "_")][
+            "Variable"
+        ].to_list()
+
+        units = [u.removeprefix(var + "_") for u in unit_options]
+
+        rule = {
+            "properties": {
+                "attribute": {"const": var},
+                "attribute_unit": {"enum": units},
+                "value_num": {"type": "number"},
+            },
+            "required": ["value_num", "attribute_unit"],
+        }
+
+        rules.append(rule)
+
+    all_types = [t for t in all_types if t not in types]
+    return rules, all_types
+
+
 def numeric_attrs(arc, types: list[str], all_types: list[str]):
     rules = []
     arc_long_numeric = arc[arc["Type"].isin(types)]
@@ -143,9 +169,9 @@ def generate_long_schema(version):
         template_long = json.load(f)
 
     # Drop the core properties from the long schema
-    # Don't include descriptive, file, or NaN types (unwanted as stored attributes)
+    # Don't include descriptive or file types (unwanted as stored attributes)
     arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
-    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", np.nan, "file"]))]
+    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))]
 
     # Get all the response types from ARC
     all_types = arc_long.Type.unique().tolist()
@@ -157,6 +183,8 @@ def generate_long_schema(version):
         arc_long, ["list", "user_list", "multi_list"], all_types
     )
 
+    unit_rules, all_types = attrs_with_units(arc_long, [np.nan], all_types)
+
     numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types)
 
     date_rules, all_types = date_attrs(
@@ -169,7 +197,12 @@ def generate_long_schema(version):
 
     # Combine all rules into one list
     one_of_rules = (
-        enum_rules + list_rules + numeric_rules + date_rules + other_str_rules
+        enum_rules
+        + list_rules
+        + unit_rules
+        + numeric_rules
+        + date_rules
+        + other_str_rules
     )
 
     # check no types have been missed

From 3bf260b49017ec4833008129146638ad3373c45b Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Tue, 9 Sep 2025 14:18:27 -0700
Subject: [PATCH 09/13] Pass ARC with remaining unmapped rows to each function
 Instead of a list of different input types

---
 schemas/isaric_schema.py | 80 ++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 41 deletions(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index f33f923..82de248 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -20,9 +20,10 @@ def get_enums(options):
     ]
 
 
-def attrs_with_enums(arc, types: list[str], all_types: list[str]):
+def attrs_with_enums(arc, types: list[str]):
     rules = []
-    arc_long_with_enums = arc[arc["Type"].isin(types)]
+    arc_filter = arc["Type"].isin(types)
+    arc_long_with_enums = arc[arc_filter]
 
     for options, group in arc_long_with_enums.groupby("Answer Options"):
         if len(group) == 1:
@@ -41,14 +42,14 @@ def attrs_with_enums(arc, types: list[str], all_types: list[str]):
             rule["properties"]["value"]["enum"] = enums
 
         rules.append(rule)
-    # drop from the list of all types
-    all_types = [t for t in all_types if t not in types]
-    return rules, all_types
 
+    return rules, arc[~arc_filter]
 
-def attrs_with_lists(arc, types: list[str], all_types: list[str]):
+
+def attrs_with_lists(arc, types: list[str]):
     rules = []
-    arc_long_lists = arc[arc["Type"].isin(types)]
+    arc_filter = arc["Type"].isin(types)
+    arc_long_lists = arc[arc_filter]
 
     for list_file, group in arc_long_lists.groupby("List"):
         if len(group) == 1:
@@ -68,18 +69,20 @@ def attrs_with_lists(arc, types: list[str], all_types: list[str]):
         rule["properties"]["value"] = {"type": "string", "enum": list_enums}
 
         rules.append(rule)
-    all_types = [t for t in all_types if t not in types]
-    return rules, all_types
+    return rules, arc[~arc_filter]
 
 
-def attrs_with_units(arc, types: list[str], all_types: list[str]):
+def attrs_with_units(arc, types: list[str]):
     rules = []
-    vars_with_units = arc[arc["Type"].isin(types)]["Variable"]
+    arc_filter = arc["Type"].isin(types)
+    vars_with_units = arc[arc_filter]["Variable"]
+    arc_vars_to_remove = vars_with_units.copy().to_list()
 
     for var in vars_with_units:
         unit_options = arc[arc["Variable"].str.startswith(var + "_")][
             "Variable"
         ].to_list()
+        arc_vars_to_remove += unit_options
 
         units = [u.removeprefix(var + "_") for u in unit_options]
 
@@ -94,13 +97,13 @@ def attrs_with_units(arc, types: list[str], all_types: list[str]):
 
         rules.append(rule)
 
-    all_types = [t for t in all_types if t not in types]
-    return rules, all_types
+    return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)]
 
 
-def numeric_attrs(arc, types: list[str], all_types: list[str]):
+def numeric_attrs(arc, types: list[str]):
     rules = []
-    arc_long_numeric = arc[arc["Type"].isin(types)]
+    arc_filter = arc["Type"].isin(types)
+    arc_long_numeric = arc[arc_filter]
 
     for min_max, group in arc_long_numeric.groupby(
         ["Minimum", "Maximum"], dropna=False
@@ -121,13 +124,13 @@ def numeric_attrs(arc, types: list[str], all_types: list[str]):
             rule["properties"]["value_num"]["maximum"] = float(max)
 
         rules.append(rule)
-    all_types = [t for t in all_types if t not in types]
-    return rules, all_types
+    return rules, arc[~arc_filter]
 
 
-def date_attrs(arc, types: list[str], all_types: list[str]):
+def date_attrs(arc, types: list[str]):
     rules = []
-    arc_long_dates = arc[arc["Type"].isin(types)]
+    arc_filter = arc["Type"].isin(types)
+    arc_long_dates = arc[arc_filter]
 
     for input_type, group in arc_long_dates.groupby("Type"):
         if len(group) == 1:
@@ -144,19 +147,18 @@ def date_attrs(arc, types: list[str], all_types: list[str]):
             rule["properties"]["value"] = {"type": "string", "format": "date-time"}
 
         rules.append(rule)
-    all_types = [t for t in all_types if t not in types]
-    return rules, all_types
+    return rules, arc[~arc_filter]
 
 
-def generic_str_attrs(arc, types: list[str], all_types: list[str]):
-    arc_long_other_str = arc[arc["Type"].isin(types)]
+def generic_str_attrs(arc, types: list[str]):
+    arc_filter = arc["Type"].isin(types)
+    arc_long_other_str = arc[arc_filter]
 
     rule = {"properties": {"attribute": {"enum": arc_long_other_str.Variable.tolist()}}}
     rule["properties"]["value"] = {"type": "string"}
     rule["required"] = ["value"]
 
-    all_types = [t for t in all_types if t not in types]
-    return [rule], all_types
+    return [rule], arc[~arc_filter]
 
 
 def generate_long_schema(version):
@@ -173,26 +175,21 @@ def generate_long_schema(version):
     arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
     arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))]
 
-    # Get all the response types from ARC
-    all_types = arc_long.Type.unique().tolist()
-
     # Generate rules for each type of attribute
-    enum_rules, all_types = attrs_with_enums(arc_long, ["radio", "checkbox"], all_types)
+    enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"])
 
-    list_rules, all_types = attrs_with_lists(
-        arc_long, ["list", "user_list", "multi_list"], all_types
+    list_rules, arc_no_lists = attrs_with_lists(
+        arc_no_enums, ["list", "user_list", "multi_list"]
     )
 
-    unit_rules, all_types = attrs_with_units(arc_long, [np.nan], all_types)
+    unit_rules, arc_no_units = attrs_with_units(arc_no_lists, [np.nan])
 
-    numeric_rules, all_types = numeric_attrs(arc_long, ["number", "calc"], all_types)
+    numeric_rules, arc_no_numbers = numeric_attrs(arc_no_units, ["number", "calc"])
 
-    date_rules, all_types = date_attrs(
-        arc_long, ["date_dmy", "datetime_dmy"], all_types
-    )
+    date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"])
 
-    other_str_rules, all_types = generic_str_attrs(
-        arc_long, ["text", "notes"], all_types
+    other_str_rules, arc_no_other_str = generic_str_attrs(
+        arc_no_dates, ["text", "notes"]
     )
 
     # Combine all rules into one list
@@ -206,10 +203,11 @@ def generate_long_schema(version):
     )
 
     # check no types have been missed
-    if len(all_types) > 0:
+    if len(arc_no_other_str) > 0:
         raise ValueError(
-            f"The following types were not processed: {', '.join(all_types)}. "
-            "Please check the ARC.csv file for any new types."
+            "The following rows were not processed: \n",
+            arc_no_other_str,
+            "Please check the ARC.csv file for any new types.",
         )
 
     template_long["oneOf"] = one_of_rules

From a856838c227dca6572673e76ee7b4779ab6ee6a5 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Wed, 22 Oct 2025 15:51:34 +0100
Subject: [PATCH 10/13] Update long ISARIC schema

---
 schemas/template-isaric-long.json | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/schemas/template-isaric-long.json b/schemas/template-isaric-long.json
index a23141e..6210e5b 100644
--- a/schemas/template-isaric-long.json
+++ b/schemas/template-isaric-long.json
@@ -12,6 +12,13 @@
             "type": "string",
             "description": "Study ID that patient belongs to."
         },
+        "event_id": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "ID code to link different attributes which relate to the same event (e.g. medication being administered) together."
+        },
         "phase": {
             "type": "string",
             "enum": [
@@ -48,13 +55,6 @@
             ],
             "description": "String data associated with the attribute."
         },
-        "value_bool": {
-            "type": [
-                "boolean",
-                "null"
-            ],
-            "description": "Use for Yes/No attributes."
-        },
         "value_num": {
             "type": [
                 "number",
@@ -69,13 +69,6 @@
             ],
             "description": "Relative day since admission."
         },
-        "reldate_adm_end": {
-            "type": [
-                "integer",
-                "null"
-            ],
-            "description": "Relative day since admission, end of period starting from reldate_adm. Should be Null if event spans <= 1 day."
-        },
         "date": {
             "type": [
                 "string",
@@ -111,11 +104,6 @@
                 "value"
             ]
         },
-        {
-            "required": [
-                "value_bool"
-            ]
-        },
         {
             "required": [
                 "value_num"

From 335148fffacb7e299844f01d39c70c3006a2e23f Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Wed, 22 Oct 2025 15:53:26 +0100
Subject: [PATCH 11/13] Update .gitignore

---
 .gitignore | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index b694934..1055d1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
-.venv
\ No newline at end of file
+.venv
+.vscode/
+.DS_Store
+
+__pycache__/
+*.pyc
\ No newline at end of file

From 0965b5e849e834a542637e383bbc30f039636704 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Wed, 22 Oct 2025 16:40:20 +0100
Subject: [PATCH 12/13] Put units back in names to match arc

---
 schemas/isaric_schema.py | 58 ++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/schemas/isaric_schema.py b/schemas/isaric_schema.py
index 82de248..b840c27 100644
--- a/schemas/isaric_schema.py
+++ b/schemas/isaric_schema.py
@@ -72,32 +72,33 @@ def attrs_with_lists(arc, types: list[str]):
     return rules, arc[~arc_filter]
 
 
-def attrs_with_units(arc, types: list[str]):
-    rules = []
-    arc_filter = arc["Type"].isin(types)
-    vars_with_units = arc[arc_filter]["Variable"]
-    arc_vars_to_remove = vars_with_units.copy().to_list()
+# Currently not used
+# def attrs_with_units(arc, types: list[str]):
+#     rules = []
+#     arc_filter = arc["Type"].isin(types)
+#     vars_with_units = arc[arc_filter]["Variable"]
+#     arc_vars_to_remove = vars_with_units.copy().to_list()
 
-    for var in vars_with_units:
-        unit_options = arc[arc["Variable"].str.startswith(var + "_")][
-            "Variable"
-        ].to_list()
-        arc_vars_to_remove += unit_options
+#     for var in vars_with_units:
+#         unit_options = arc[arc["Variable"].str.startswith(var + "_")][
+#             "Variable"
+#         ].to_list()
+#         arc_vars_to_remove += unit_options
 
-        units = [u.removeprefix(var + "_") for u in unit_options]
+#         units = [u.removeprefix(var + "_") for u in unit_options]
 
-        rule = {
-            "properties": {
-                "attribute": {"const": var},
-                "attribute_unit": {"enum": units},
-                "value_num": {"type": "number"},
-            },
-            "required": ["value_num", "attribute_unit"],
-        }
+#         rule = {
+#             "properties": {
+#                 "attribute": {"const": var},
+#                 "attribute_unit": {"enum": units},
+#                 "value_num": {"type": "number"},
+#             },
+#             "required": ["value_num", "attribute_unit"],
+#         }
 
-        rules.append(rule)
+#         rules.append(rule)
 
-    return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)]
+#     return rules, arc[~arc["Variable"].isin(arc_vars_to_remove)]
 
 
 def numeric_attrs(arc, types: list[str]):
@@ -171,9 +172,9 @@ def generate_long_schema(version):
         template_long = json.load(f)
 
     # Drop the core properties from the long schema
-    # Don't include descriptive or file types (unwanted as stored attributes)
+    # Don't include descriptive, file, or NaN types (unwanted as stored attributes)
     arc_long = arc[~arc.Variable.isin(template_core["properties"].keys())]
-    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file"]))]
+    arc_long = arc_long[~(arc_long.Type.isin(["descriptive", "file", np.nan]))]
 
     # Generate rules for each type of attribute
     enum_rules, arc_no_enums = attrs_with_enums(arc_long, ["radio", "checkbox"])
@@ -182,9 +183,7 @@ def generate_long_schema(version):
         arc_no_enums, ["list", "user_list", "multi_list"]
     )
 
-    unit_rules, arc_no_units = attrs_with_units(arc_no_lists, [np.nan])
-
-    numeric_rules, arc_no_numbers = numeric_attrs(arc_no_units, ["number", "calc"])
+    numeric_rules, arc_no_numbers = numeric_attrs(arc_no_lists, ["number", "calc"])
 
     date_rules, arc_no_dates = date_attrs(arc_no_numbers, ["date_dmy", "datetime_dmy"])
 
@@ -194,12 +193,7 @@ def generate_long_schema(version):
 
     # Combine all rules into one list
     one_of_rules = (
-        enum_rules
-        + list_rules
-        + unit_rules
-        + numeric_rules
-        + date_rules
-        + other_str_rules
+        enum_rules + list_rules + numeric_rules + date_rules + other_str_rules
     )
 
     # check no types have been missed

From afef71b3b90065a4494de571ea90bca7f48ec3a6 Mon Sep 17 00:00:00 2001
From: Pip Liggins <philippa.liggins@dtc.ox.ac.uk>
Date: Thu, 23 Oct 2025 10:13:38 +0100
Subject: [PATCH 13/13] Edit to add V1.1.1 back into changelog

---
 CHANGELOG.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 201fbfa..cd1d2bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,8 +12,16 @@ ARC v1.1.3 delivers significant updates to support dengue research and harmonisa
 - Restructured **Dengue ARChetype CRF**, aligning variable definitions with current WHO classification.  
 - Introduced **recommended outcomes for Dengue**, enabling more standardised reporting across studies.  
 
+## ARC v1.1.1 (02 Jul 2025)
+
+### Overview
+In ARC v1.1.1, variables in the ONSET & PRESENTATION section were updated to use the prefix pres_ instead of date_ to better reflect their meaning.
+
+### Variable‑Level Updates
+Prefix change in ONSET & PRESENTATION section: Variables previously using the date_ prefix were renamed to pres_ (e.g., date_adm → pres_adm).
+
 ### Interoperability
-Standardised term code lists have been revised to align with **SNOMED‑CT**, **LOINC**, and **UMLS**.
+Standardised term code lists have been revised to align with SNOMED‑CT, LOINC, and UMLS.
 
 ## ARC v1.1.0 (09 May 2025)