From 0509cb79c0326a55f0cd9c427de6fb7aac9e1db0 Mon Sep 17 00:00:00 2001 From: Florian Nachtigall Date: Thu, 16 Oct 2025 15:18:12 +0200 Subject: [PATCH] Add data theme definitions table to /guides - Added `theme-definitions.json` with structured descriptions of all data themes (based on the [Definitions Proposal](https://lf-overturemaps.atlassian.net/wiki/spaces/PROJ/pages/353927169/Definitions+Proposal)) - Added `theme-definitions.mdx` page under /guides to display the themes table - Introduced `ThemesTable` React component for rendering and styling the table Note: Theme definitions are still incomplete and may contain errors. --- docs/guides/theme-definitions.json | 614 +++++++++++++++++++++++++++++ docs/guides/theme-definitions.mdx | 16 + sidebars.js | 1 + src/components/themesTable.js | 536 +++++++++++++++++++++++++ 4 files changed, 1167 insertions(+) create mode 100644 docs/guides/theme-definitions.json create mode 100644 docs/guides/theme-definitions.mdx create mode 100644 src/components/themesTable.js diff --git a/docs/guides/theme-definitions.json b/docs/guides/theme-definitions.json new file mode 100644 index 00000000..dddd0eb5 --- /dev/null +++ b/docs/guides/theme-definitions.json @@ -0,0 +1,614 @@ +{ + "themes": { + "Places": { + "brief_description": "Concrete, physically identifiable, stationary destinations.", + "licenses": [ + { + "name": "CDLA Permissive 2.0.", + "url": "https://cdla.dev/permissive-2-0/" + }, + { + "name": "Apache 2.0.", + "url": "https://www.apache.org/licenses/LICENSE-2.0" + } + ], + "sources": [ + { + "name": "Meta", + "freshness": "", + "type": "commercial", + "url": "" + }, + { + "name": "Foursquare", + "freshness": "", + "type": "commercial", + "url": "" + }, + { + "name": "Microsoft", + "freshness": "", + "type": "commercial", + "url": "" + }, + { + "name": "PinMeTo", + "freshness": "", + "type": "commercial", + "url": "" + } + ], + "gers": { + "gersified": { + "flag": true, + "note": "" + }, + "bridge_files": { + "flag": true, 
+ "note": "" + }, + "registry": { + "flag": true, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": true, + "note": "" + }, + "excluded_by_design": [ + "Concrete and physically identifiable: to exclude divisions and addresses", + "Stationary: to exclude noise from UGC datasets for things like food carts or vehicles/boats/aircraft", + "Destination: to exclude bus stops and train platforms and other intermediate waypoints", + "Private: we only include places that do not include PII" + ], + "freshness": { + "release_frequency": "monthly", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "Global", + "coverage": [ + "Global", + "US ~73%" + ], + "quality_summary": "Duplicates", + "quality": [ + "Duplicates", + "High junk rate", + "Low attribute completeness" + ], + "violations": [] + }, + "filtering": { + "summary": "None, except existence `confidence` > 0.2", + "location": [ + "" + ], + "topological": [], + "geometrical": [], + "properties": [ + "`confidence` > 0.2" + ], + "others": [] + }, + "matching": { + "summary": "ML-based matching with clustering", + "logic": [ + "Identifying potential matching pairs based on quadkey", + "ML-based matching based on attribute similarity" + ], + "properties": [ + "`name`", + "`address`", + "`phone number`", + "`house number`", + "`website`", + "`spatial distance`" + ] + }, + "merging": { + "summary": "Promotion of single source; no merging of attributes", + "logic": [ + "Clustering of matched places", + "Promotion of place from source with the highest match count", + "No merging of attributes between matches" + ], + "constraints": [] + } + }, + "Buildings": { + "brief_description": "Permanent human-made structures with a roof.", + "licenses": [ + { + "name": "ODbL", + "url": "https://opendatacommons.org/licenses/odbl/" + } + ], + "sources": [ + { + "name": "OSM", + "freshness": "monthly", + "type": "community", + "url": "https://osm.org/" + 
}, + { + "name": "Esri", + "freshness": "biannually", + "type": "community", + "url": "https://communitymaps.arcgis.com/home" + }, + { + "name": "Vancouver", + "freshness": "biannually", + "type": "authoritative", + "url": "https://opendata.vancouver.ca/" + }, + { + "name": "IGN Spain", + "freshness": "one-time ingestion", + "type": "authoritative", + "url": "https://www.ign.es/" + }, + { + "name": "Microsoft", + "freshness": "irregularly", + "type": "ML-derived", + "url": "https://github.com/microsoft/GlobalMLBuildingFootprints" + }, + { + "name": "Google", + "freshness": "one-time ingestion", + "type": "ML-derived", + "url": "https://sites.research.google/open-buildings/" + }, + { + "name": "East Asian countries", + "freshness": "one-time ingestion", + "type": "ML-derived", + "url": "https://zenodo.org/records/8174931" + } + ], + "gers": { + "gersified": { + "flag": true, + "note": "only building, not building_part" + }, + "bridge_files": { + "flag": true, + "note": "One-to-one matches only" + }, + "registry": { + "flag": true, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": false, + "note": "" + }, + "excluded_by_design": [ + "Features that are well defined in other themes. 
Examples:", + "Physical “regions”", + "Places of business" + ], + "freshness": { + "release_frequency": "monthly", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "Global", + "coverage": "Global", + "quality_summary": "Lower footprint precision in ML-derived sources", + "quality": [ + "Lower footprint precision in Global South due to high share of ML-derived buildings" + ], + "violations": [ + "Pre-match violations: `building_tiny`, `building_large`, `building_huge`, `building_invalid_geometry`, `building_duplicate_record_id`.", + "Post-merge violations: `building_transportation_intersection`, `building_water_intersection`, `building_invalid_area`, `building_too_many_small_angles`" + ] + }, + "filtering": { + "summary": "Overlap allowed within source, but not between sources", + "location": [ + "Buildings in water" + ], + "topological": [ + "Overlap allowed within source, but not between sources" + ], + "geometrical": [ + "Geometry identical to source", + "(Multi)Polygons with too many sharp angles are excluded", + "Footprint area > 10m for ML-derived sources" + ], + "properties": [ + "`height` < 900m" + ], + "others": [] + }, + "matching": { + "summary": "Geometric similarity (Intersection-over-Union > 0.5)", + "logic": "Intersection-over-Union > 0.5", + "properties": [ + "`geometry`" + ] + }, + "merging": { + "summary": "Hierarchical non-overlapping spatial merge", + "logic": [ + "Hierarchical merging of non-overlapping footprints", + "Merging of building height attributes between matches" + ], + "constraints": [ + "No spatial overlap for footprint merging", + "Intersection-over-Union > 0.5 for attribute merging" + ] + } + }, + "Addresses": { + "brief_description": "Unique geographic points representing a physical address location.", + "licenses": [ + { + "name": "Various licenses", + "url": "https://docs.overturemaps.org/attribution/#addresses" + } + ], + "sources": [ + { + "name": "OpenAddresses", + "freshness": "", + "type": 
"community", + "url": "https://openaddresses.io/" + }, + { + "name": "AddressForAll", + "freshness": "", + "type": "community", + "url": "https://www.addressforall.org/en/" + }, + { + "name": "City of New York", + "freshness": "", + "type": "authoritative", + "url": "https://data.cityofnewyork.us/City-Government/NYC-Address-Points/g6pj-hd8k" + }, + { + "name": "U.S. Department of Transportation", + "freshness": "", + "type": "authoritative", + "url": "https://www.transportation.gov/gis/national-address-database" + } + ], + "gers": { + "gersified": { + "flag": false, + "note": "" + }, + "bridge_files": { + "flag": false, + "note": "" + }, + "registry": { + "flag": true, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": false, + "note": "" + }, + "excluded_by_design": [ + "Open, navigable spaces like fields, parks, or oceans" + ], + "freshness": { + "release_frequency": "", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "37 countries", + "coverage": [ + "Coverage in 37 countries", + "Several countries with partial coverage: US, Germany, Taiwan", + "Datasets have varying levels of completeness in their attributes. A dataset may be missing postcodes or only have partial coverage for `address_levels` for example." 
+ ], + "quality_summary": "Duplicates", + "quality": [ + "Duplicates" + ], + "violations": [] + }, + "filtering": { + "summary": "", + "location": [ + "" + ], + "topological": [], + "geometrical": [], + "properties": [], + "others": [] + }, + "matching": { + "summary": "", + "logic": "n/a (subtypes within a country have a single source)", + "properties": [] + }, + "merging": { + "summary": "", + "logic": "", + "constraints": [] + } + }, + "Transportation": { + "brief_description": "Traversable path segments (roads, railways, trails) or connectors (road intersections).", + "licenses": [ + { + "name": "ODbL", + "url": "https://opendatacommons.org/licenses/odbl/" + } + ], + "sources": [ + { + "name": "OSM", + "freshness": "", + "type": "community", + "url": "https://osm.org/" + }, + { + "name": "TomTom", + "freshness": "", + "type": "commercial", + "url": "https://www.tomtom.com/" + } + ], + "gers": { + "gersified": { + "flag": true, + "note": "" + }, + "bridge_files": { + "flag": true, + "note": "" + }, + "registry": { + "flag": true, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": false, + "note": "" + }, + "excluded_by_design": [ + "Aerial paths, such as flight paths or geostationary satellite orbits", + "Paths traversed by continuous entities: oil pipelines, electric lines" + ], + "freshness": { + "release_frequency": "monthly", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "Global", + "coverage": [ + "The network is matched against TomTom’s (internally); any features in TomTom MNR that do not match are used as a signal indicating a missing road.", + "TomTom also performs GPS trace matching on the network." 
+ ], + "quality_summary": "", + "quality": [ + "Presence of navigational islands" + ], + "violations": [] + }, + "filtering": { + "summary": "Deduplication of nodes", + "location": [], + "topological": [], + "geometrical": [], + "properties": [], + "others": [ + "Deduplication of nodes" + ] + }, + "matching": { + "summary": "", + "logic": "n/a (single source)", + "properties": [] + }, + "merging": { + "summary": "", + "logic": "n/a (single source)", + "constraints": [] + } + }, + "Divisions": { + "brief_description": "Recognized areas for governance, culture, or organization.", + "licenses": [ + { + "name": "ODbL", + "url": "https://opendatacommons.org/licenses/odbl" + } + ], + "sources": [ + { + "name": "OSM", + "freshness": "", + "type": "community", + "url": "https://osm.org/" + }, + { + "name": "geoBoundaries", + "freshness": "", + "type": "community", + "url": "https://www.geoboundaries.org/" + } + ], + "gers": { + "gersified": { + "flag": true, + "note": "" + }, + "bridge_files": { + "flag": true, + "note": "" + }, + "registry": { + "flag": true, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": false, + "note": "" + }, + "excluded_by_design": [ + "Non unique features: An address could relate to multiple Places for example but there should not be more than one address point with the same values." + ], + "freshness": { + "release_frequency": "", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "Global", + "coverage": [ + "Coverage generally aligns with admin_level tags in OSM and geoBoundaries datasets.", + "Global coverage of country, dependency, region, and county.", + "Macroregion and macrocounty should be present, but are miscategorized as other subtypes.", + "Subtypes below county (locality, borough, neighborhood, microhood) should be present in every country, but coverage is often spotty." 
+ ], + "quality_summary": "Minor macroregion issues; sub-county coverage spotty", + "quality": [], + "violations": [] + }, + "filtering": { + "summary": "Deduplication; overlap not allowed for countries", + "location": [ + "" + ], + "topological": [ + "Overlap allowed at lower subtypes (ex locality), not allowed in others (ex country)" + ], + "geometrical": [], + "properties": [], + "others": [ + "Deduplication" + ] + }, + "matching": { + "summary": "", + "logic": "", + "properties": [] + }, + "merging": { + "summary": "", + "logic": "", + "constraints": [] + } + }, + "Base": { + "brief_description": "Foundational layers such as land, water, infrastructure, and bathymetry.", + "licenses": [ + { + "name": "ODbL", + "url": "https://opendatacommons.org/licenses/odbl/" + } + ], + "sources": [ + { + "name": "Daylight Coastlines (OSM)", + "freshness": "", + "type": "community", + "url": "https://daylightmap.org/coastlines.html" + }, + { + "name": "ETOPO1", + "freshness": "", + "type": "community", + "url": "https://www.ncei.noaa.gov/products/etopo-global-relief-model" + }, + { + "name": "GLOBathy", + "freshness": "", + "type": "ML-derived", + "url": "https://www.nature.com/articles/s41597-022-01132-9" + }, + { + "name": "ESA WorldCover", + "freshness": "", + "type": "ML-derived", + "url": "https://esa-worldcover.org/en" + } + ], + "gers": { + "gersified": { + "flag": false, + "note": "" + }, + "bridge_files": { + "flag": false, + "note": "" + }, + "registry": { + "flag": false, + "note": "" + }, + "data_changelog": { + "flag": true, + "note": "" + } + }, + "signal_confidence_score": { + "flag": false, + "note": "" + }, + "excluded_by_design": [], + "freshness": { + "release_frequency": "monthly", + "last_updated": "" + }, + "quality_assurance": { + "coverage_summary": "Global", + "coverage": "Features in base are not considered to be their own entities, so non-bathymetry coverage is just basic features from OSM with all the pass through tags.", + "quality_summary": 
"Derived from OSM tags", + "quality": [], + "violations": [] + }, + "filtering": { + "summary": "Tag-based filtering (non-bathymetry)", + "location": [], + "topological": [], + "geometrical": [], + "properties": [], + "others": [] + }, + "matching": { + "summary": "", + "logic": "n/a (types are single source)", + "properties": [] + }, + "merging": { + "summary": "", + "logic": "n/a (types are single source)", + "constraints": [] + } + } + } +} \ No newline at end of file diff --git a/docs/guides/theme-definitions.mdx b/docs/guides/theme-definitions.mdx new file mode 100644 index 00000000..53c5f2c2 --- /dev/null +++ b/docs/guides/theme-definitions.mdx @@ -0,0 +1,16 @@ +--- +title: Theme Definitions +description: An overview of theme definitions and characteristics +--- + +import data from '@site/docs/guides/theme-definitions.json'; +import ThemesTable from '@site/src/components/themesTable'; + +# Data Theme Definitions + +An overview of each data theme and its key properties. + + + diff --git a/sidebars.js b/sidebars.js index d6dae5ca..abf3c739 100644 --- a/sidebars.js +++ b/sidebars.js @@ -57,6 +57,7 @@ const sidebars = { }, collapsed: true, items: [ + 'guides/theme-definitions', 'guides/addresses', 'guides/base', 'guides/buildings', diff --git a/src/components/themesTable.js b/src/components/themesTable.js new file mode 100644 index 00000000..333d2e0b --- /dev/null +++ b/src/components/themesTable.js @@ -0,0 +1,536 @@ +import React, { useRef, memo, useMemo, useState, useEffect } from "react"; +import YAMLFileResolver from "@site/src/components/shared-libs/yamlFileResolver"; + +const SCHEMA_GROUPS = { + Places: ["places/place.yaml"], + Addresses: ["addresses/address.yaml"], + Buildings: ["buildings/building.yaml", "buildings/building_part.yaml"], + Transportation: ["transportation/segment.yaml", "transportation/connector.yaml"], + Divisions: [ + "divisions/division.yaml", + "divisions/division_area.yaml", + "divisions/division_boundary.yaml", + ], + Base: [ + 
"base/bathymetry.yaml", + "base/land.yaml", + "base/land_use.yaml", + "base/land_cover.yaml", + "base/infrastructure.yaml", + ], +}; + +const useIsOverflowing = (text, lines) => { + const ref = useRef(null); + const [isOverflowing, setIsOverflowing] = useState(false); + + useEffect(() => { + const el = ref.current; + if (!el) return; + + const lineHeight = parseFloat(getComputedStyle(el).lineHeight || 20); + const maxHeight = lineHeight * lines; + const checkOverflow = () => setIsOverflowing(el.scrollHeight > maxHeight + 1); + + checkOverflow(); + window.addEventListener("resize", checkOverflow); + return () => window.removeEventListener("resize", checkOverflow); + }, [text, lines]); + + return [ref, isOverflowing]; +}; + +const getCroppedCellStyle = (cropped, lines) => ({ + position: "relative", + overflow: cropped ? "hidden" : "visible", + maxHeight: cropped ? `${1.6 * lines}em` : "none", + display: "-webkit-box", + WebkitBoxOrient: "vertical", + WebkitLineClamp: cropped ? lines : "unset", + textOverflow: cropped ? "ellipsis" : "clip", + whiteSpace: "pre-line", + cursor: "pointer", + transition: "max-height 0.3s ease, mask-image 0.2s ease", + ...(cropped && { + maskImage: "linear-gradient(to bottom, black 75%, transparent 100%)", + WebkitMaskImage: "linear-gradient(to bottom, black 75%, transparent 100%)", + }), +}); + +const CroppedText = memo(({ text, expanded, lines = 10 }) => { + const [ref, isOverflowing] = useIsOverflowing(text, lines); + const cropped = !expanded && isOverflowing; + + return ( +
+ {text || "—"} +
+ ); +}); + + +const renderCodeSpans = (text) => { + const parts = String(text).split(/(`[^`]+`)/g); + return parts.map((part, i) => { + if (part.startsWith("`") && part.endsWith("`")) { + return ( + + {part.slice(1, -1)} + + ); + } + return {part}; + }); +}; + +const getGeometryTypes = (schemas) => { + const extract = (geom) => { + if (!geom) return []; + const items = geom.oneOf || geom.allOf || [geom]; + return items + .map((g) => g.$ref?.match(/([A-Za-z]+)\.json$/)?.[1] || "Unknown") + .filter(Boolean); + }; + + return [...new Set(schemas.flatMap(s => extract(s?.properties?.geometry)))]; +}; + +export default function ThemesTable({ data }) { + const themes = data.themes; + const [expandedTheme, setExpandedTheme] = useState(null); + const [schemas, setSchemas] = useState({}); + const parsedSchemas = useMemo(() => schemas, [schemas]); + + useEffect(() => { + const resolver = YAMLFileResolver(); + const allPaths = Object.values(SCHEMA_GROUPS).flat(); + + Promise.all(allPaths.map((p) => resolver.resolve(p))).then((loaded) => { + const result = {}; + let index = 0; + for (const [group, paths] of Object.entries(SCHEMA_GROUPS)) { + result[group] = paths.map(() => loaded[index++]); + } + setSchemas(result); + }); + }, []); + + + const th = { + borderBottom: "2px solid #ccc", + textAlign: "left", + padding: "6px 8px", + backgroundColor: "#f7f7f7", + fontWeight: 600, + fontSize: "0.85rem", + }; + + const td = { + borderBottom: "1px solid #eee", + padding: "6px 8px", + verticalAlign: "top", + fontSize: "0.88rem", + }; + + const toggleExpand = (themeName) => + setExpandedTheme(expandedTheme === themeName ? null : themeName); + + const checkIcon = (val, hasNote = false) => { + if (val) { + return hasNote ? 
"☑️" : "✅" + } + return "❌" + }; + + const FlagCell = ({ item }) => { + const note = item?.note || "" + const hasNote = Boolean(note) + + return ( + + {checkIcon(item?.flag, hasNote)} + + ) + }; + + const renderList = (arr) => { + if (!Array.isArray(arr) || arr.length === 0) return ; + + return ( + + ); + }; + + const renderNameWithUrl = (name, url, strong = false) => { + const label = name || url || "—"; + + if (url) { + return ( + (e.target.style.textDecoration = "underline")} + onMouseOut={(e) => (e.target.style.textDecoration = "none")} + > + {label} + + ); + } + + return strong ? {label} : {label}; + }; + + const renderSources = (sources) => { + if (!Array.isArray(sources) || sources.length === 0) return ; + + return ( + + ); + }; + + return ( +
+ + + + + + + + + + + + + + + + + + + + + + + + {Object.entries(themes).map(([name, theme]) => { + const schemas = parsedSchemas[name] || []; + const geometries = getGeometryTypes(schemas); + const shortDef = theme.brief_description || "—"; + const coverage = theme.quality_assurance?.coverage_summary || "—"; + const quality = theme.quality_assurance?.quality_summary || "—"; + const filteringSummary = theme.filtering?.summary || "—"; + const matchingSummary = theme.matching?.summary || "—"; + const mergingSummary = theme.merging?.summary || "—"; + const gers = theme.gers || {}; + const freshnessText = theme.freshness?.release_frequency || ""; + + return ( + + toggleExpand(name)} + > + + + + + + + + + + + + + + + + + + + {expandedTheme === name && ( + + + + )} + + ); + })} + +
+ 💡 Click a row to view detailed theme definition. +
ThemeDescriptionCoverageQualityRelease FrequencyLicensesSourcesGERSRegistryChangelogBridge FilesConfidence ScoreGeometry TypesFilteringMatchingMerging
+ {name} + {shortDef}{coverage}{quality}{freshnessText}{renderList(theme.licenses || [])} + + {geometries.join(", ") || "—"} + + + + + +
+ +
+
+ ); +} + +function ExpandedThemeDetails({ theme, schemas }) { + const blockStyle = { marginBottom: "1.2rem" }; + + const boxGrid = { + display: "grid", + gap: "1rem", + gridTemplateColumns: "repeat(auto-fit, minmax(280px, 1fr))", + marginTop: "1.2rem", + marginBottom: "2rem", + }; + + const box = { + border: "1px solid #d9f0f2", + backgroundColor: "#f7fcfc", + padding: "0.9rem 1.1rem", + fontSize: "0.9rem", + }; + + const boxTitle = { + fontWeight: "600", + fontSize: "0.95rem", + marginBottom: "0.4rem", + textTransform: "capitalize", + }; + + const renderList = (arr) => ( + + ); + + const renderKeyValue = (obj) => { + if (!obj || typeof obj !== "object") return ; + + const entries = Object.entries(obj).filter( + ([key]) => !key.toLowerCase().includes("summary") + ); + if (entries.length === 0) return ; + + const capitalize = (s) => + s.charAt(0).toUpperCase() + s.slice(1).replace(/_/g, " "); + + return ( + + ); + }; + + + const renderSchemaSummaries = (schemas) => { + if (!schemas || schemas.length === 0) return null; + return ( +
+
Data Types
+
+ {schemas.map((schema, idx) => { + const props = schema.properties?.properties?.properties || {}; + const geometries = getGeometryTypes([schema]); + + return ( +
+
+ {schema.title} +
+
+ {schema.description} +
+
+ Geometry: {geometries.join(", ") || "—"} +
+ {Object.keys(props).length > 0 && ( +
+ + Show properties + +
    + {Object.entries(props).map(([k, v]) => ( +
  • + {k} — {v.description || "—"} + + {Array.isArray(v.enum) && v.enum.length > 0 && ( +
    + + Show {v.enum.length} values + +
    + {v.enum.join(", ")} +
    +
    + )} +
  • + ))} +
+
+ )} +
+ ); + })} +
+
+ ); + }; + + return ( +
+ {renderSchemaSummaries(schemas)} + + {theme.excluded_by_design && ( +
+
+ Excluded by Design +
+ {renderList(theme.excluded_by_design)} +
+ )} + + {theme.quality_assurance && ( +
+
+ Quality Assurance +
+ {renderKeyValue(theme.quality_assurance)} +
+ )} + +
+ {theme.filtering && ( +
+
Filtering
+ {renderKeyValue(theme.filtering)} +
+ )} + {theme.matching && ( +
+
Matching
+ {renderKeyValue(theme.matching)} +
+ )} + {theme.merging && ( +
+
Merging
+ {renderKeyValue(theme.merging)} +
+ )} +
+
+ ); +}