def canonicalize(s):
    """Return *s* lower-cased with all non-word characters removed.

    Only "word" characters (letters, digits and the underscore, as
    defined by the regex word-character class) are kept, so strings
    that differ only in whitespace, punctuation or letter case
    canonicalize to the same value.
    """
    # Collect the runs of word characters and glue them back together;
    # this drops exactly the characters a non-word substitution would remove.
    word_runs = re.findall(r"\w+", s)
    return "".join(word_runs).lower()
+ # update the versions for these here and break to avoid duplicate rows + if item["Property"] == existing_item["Property"] and item["Parent Type"] == existing_item["Parent Type"]: + if canonicalize(item["Type"]) != canonicalize(existing_item["Type"]): + # both types meaningfully differ + item["versions"] = [version] + json_items.append(item) + else: + item["Type"] = existing_item["Type"] + if version not in existing_item["versions"]: + existing_item["versions"].append(version) + + if item["Description"] != existing_item["Description"] and item["Type"] == existing_item["Type"]: + item["Description"] = existing_item["Description"] + if version not in existing_item["versions"]: + existing_item["versions"].append(version) + break + else: # No similar item, create a new one item["versions"] = [version]