diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a2c78e6..68f5958a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - AddHCATMixin assures hcat-extension validity and csv-based data-conversion if required - EuroCropsConverterMixin is a BaseClass for EuroCrops-provided datasets - EuroLandBaseConverter is a BaseClass for Euroland-provided datasets +- Add some converters that contain splits for ML use cases - Avoid base property schema override - Add Converter for Bulgaria - Remove unintended CommonMark formatting (indentation) from descriptions in converters diff --git a/fiboa_cli/datasets/ai4sf.py b/fiboa_cli/datasets/ai4sf.py index 4f043fa8..0296e18e 100644 --- a/fiboa_cli/datasets/ai4sf.py +++ b/fiboa_cli/datasets/ai4sf.py @@ -3,7 +3,7 @@ from ..conversion.fiboa_converter import FiboaBaseConverter -class Converter(FiboaBaseConverter): +class Ai4SfConverter(FiboaBaseConverter): sources = { # Cambodia "https://phys-techsciences.datastations.nl/api/access/datafile/100634?gbrecs=true": "2_cambodia_areas.gpkg", @@ -72,7 +72,7 @@ class Converter(FiboaBaseConverter): } id = "ai4sf" - short_name = "Cambodia/Vietnam (AI4SmallFarms)" + short_name = "Cambodia/Vietnam (AI4SF)" title = "Field boundaries for Cambodia and Vietnam (AI4SmallFarms)" # from https://research.tudelft.nl/en/publications/ai4smallfarms-a-dataset-for-crop-field-delineation-in-southeast-a description = """ diff --git a/fiboa_cli/datasets/ai4sf_ml.py b/fiboa_cli/datasets/ai4sf_ml.py new file mode 100644 index 00000000..2f2aafb5 --- /dev/null +++ b/fiboa_cli/datasets/ai4sf_ml.py @@ -0,0 +1,20 @@ +from .ai4sf import Ai4SfConverter +from .commons.ml_splits import MlSplitsMixin + + +class Ai4SfMlConverter(MlSplitsMixin, Ai4SfConverter): + + def migrate(self, gdf): + # Download file with splits + urls = { + "https://phys-techsciences.datastations.nl/api/access/datafile/100418?gbrecs=true": "tiles_asia.gpkg", + } + 
paths = self.download_files(urls, self.cache) + tiles = self.read_data(paths, **self.open_options) + + # Add splits + splits = tiles[["id", "country", "split"]].drop_duplicates(subset=["id", "country"]) + gdf = gdf.merge(splits, on=["id", "country"], how="left") + gdf["split"] = gdf["split"].replace({"validate": "val"}) + + return super().migrate(gdf) diff --git a/fiboa_cli/datasets/commons/ml_splits.py b/fiboa_cli/datasets/commons/ml_splits.py new file mode 100644 index 00000000..707fd907 --- /dev/null +++ b/fiboa_cli/datasets/commons/ml_splits.py @@ -0,0 +1,26 @@ +class MlSplitsMixin: + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.cache = None + + self.id = self.id + "_ml" + self.title += " with splits" + self.short_name += " with splits" + + self.columns["split"] = "split" + + if "required" not in self.missing_schemas: + self.missing_schemas["required"] = [] + self.missing_schemas["required"].append("split") + if "properties" not in self.missing_schemas: + self.missing_schemas["properties"] = {} + self.missing_schemas["properties"]["split"] = { + "type": "string", + "enum": ["train", "val", "test"], + } + + def download_files(self, uris, cache_folder=None, **kwargs): + # Store cache folder for later use in migrate + self.cache = cache_folder + return super().download_files(uris, cache_folder, **kwargs)