Merged
Commits
45 commits
089196b
Switch to Poetry
mawelborn Feb 13, 2025
2a40c60
Fix bad import in unit tests
mawelborn Feb 13, 2025
0717468
Replace removed `CreateModelGroup` with `AddModelGroupComponent`
mawelborn Feb 13, 2025
28ebe57
Move integration tests into their own folder
mawelborn Feb 14, 2025
509bb48
Only run unit tests by default
mawelborn Feb 14, 2025
7017db8
Fix unit tests
mawelborn Feb 14, 2025
723b792
Remove unused test files
mawelborn Feb 14, 2025
db1cca9
Factor out `FILE_PATH` with better fixtures
mawelborn Feb 26, 2025
a597080
Replace integration test environment variables with pytest arguments
mawelborn Feb 27, 2025
649750c
Fix a `pytest-asyncio` warning
mawelborn Feb 27, 2025
08c71a0
Remove unused `pytest-dependency` dependency
mawelborn Feb 27, 2025
30d75bc
Update poe tasks
mawelborn Feb 27, 2025
adf4e14
Don't wrap retry errors with `MaxRetriesExceeded`
mawelborn Feb 28, 2025
37b27c0
Add PEP 561 `py.typed` marker
mawelborn Mar 5, 2025
27ffde9
Explicitly export names from modules
mawelborn Mar 5, 2025
d34c0d7
Ignore static type errors in untyped modules and dependencies
mawelborn Mar 5, 2025
0e8d25c
Fix types in `create_client`
mawelborn Mar 5, 2025
e22abfc
Remove unused dependencies
mawelborn Mar 5, 2025
3df9045
Make `tqdm` an optional dependency
mawelborn Mar 5, 2025
59caf58
Make `plotly` an optional dependency
mawelborn Mar 5, 2025
fee11bc
Make `pandas` an optional dependency
mawelborn Mar 5, 2025
2529579
Remove `numpy` as a transitive dependency
mawelborn Mar 5, 2025
46a0d48
Update patch versions of dependencies
mawelborn Mar 5, 2025
864a6e7
Format codebase with black and ruff
mawelborn Mar 5, 2025
326b160
Remove future annotations import
mawelborn Mar 5, 2025
a3fe571
Remove unused imports and format strings
mawelborn Mar 5, 2025
e2fb00d
Fix inverted `not in`
mawelborn Mar 5, 2025
846a40e
Remove executable file bit
mawelborn Mar 5, 2025
6bcdb2b
Remove `== True`, `== False`, and `== None`
mawelborn Mar 5, 2025
d4575c2
Replace star imports with explicit imports
mawelborn Mar 5, 2025
90e0d37
Remove bare excepts
mawelborn Mar 5, 2025
1517b47
Replace `type(...) == ...` with `isinstance(..., ...)`
mawelborn Mar 5, 2025
03271de
Remove unused variables
mawelborn Mar 5, 2025
f16b5fa
Add missing import
mawelborn Mar 5, 2025
00b573f
Fix undefined variable
mawelborn Mar 5, 2025
293ce03
Fix redefinition of test function
mawelborn Mar 5, 2025
c50d40f
Raise `RuntimeError` instead of base `Exception`
mawelborn Mar 5, 2025
9d42543
Split long lines of code across multiple lines
mawelborn Mar 5, 2025
a6682e7
Wrap comments and docstrings at 88 characters
mawelborn Mar 5, 2025
6a71375
Update GitHub Actions
mawelborn Mar 9, 2025
ea0bd76
Update readme and pyproject.toml
mawelborn Mar 9, 2025
099f19d
Add an `all` extras group to install all optional dependencies
mawelborn Mar 9, 2025
2b531a3
Update changelog
mawelborn Mar 10, 2025
103c149
Fix GitHub Actions
mawelborn Mar 10, 2025
394c051
Fix typo in readme
mawelborn Mar 14, 2025
68 changes: 27 additions & 41 deletions .github/workflows/python.yaml
@@ -8,61 +8,47 @@ on:

 jobs:
   Python:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04

     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
-
-    env:
-      API_TOKEN: ${{ secrets.TEST_API_TOKEN }}
-      DATASET_ID: ${{ secrets.DATASET_ID }}
-      PDF_DATASET_ID: ${{ secrets.PDF_DATASET_ID }}
-      MODEL_NAME: ${{ secrets.MODEL_NAME }}
-      WORKFLOW_ID: ${{ secrets.WORKFLOW_ID }}
-      MODEL_ID: ${{ secrets.MODEL_ID }}
-      MODEL_GROUP_ID: ${{ secrets.MODEL_GROUP_ID }}
-      TEACH_TASK_ID: ${{ secrets.TEACH_TASK_ID }}
-      HOST_URL: try.indico.io
+        python-version: ["3.10", "3.11", "3.12", "3.13"]

     steps:
       - name: Checkout Commit
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: Install Poetry
+        run: pipx install "poetry>=2,<3"

       - name: Install Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "poetry"

       - name: Install Dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install -r requirements.txt
-          python -m pip install -e .[full]
-          python -m pip install flake8 pytest pytest-cov
+          poetry env use ${{ matrix.python-version }}
+          poetry install

-      - name: Run Tests And Build Coverage File
-        run: |
-          pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=indico_toolkit tests/ | tee pytest-coverage.txt
+      - name: Run Black
+        run: poetry run poe black-check

-      - name: Pytest Coverage Comment
-        id: coverageComment
-        uses: MishaKav/pytest-coverage-comment@main
-        with:
-          pytest-coverage-path: ./pytest-coverage.txt
-          junitxml-path: ./pytest.xml
-          title: Indico Toolkit Coverage Report
-          badge-title: Test Coverage
-          default-branch: main
+      - name: Run Ruff
+        run: poetry run poe ruff-check

-      - name: Check The Output Coverage
-        run: |
-          echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}"
-          echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}"
-          echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}"
-          echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}"
-          echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}"
-          echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}"
-          echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}"
-          echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}"
+      - name: Run Mypy
+        run: poetry run poe mypy
+
+      - name: Run Pytest
+        run: poetry run poe test
+
+      - name: Install Extra Dependencies
+        run: |
+          poetry env use ${{ matrix.python-version }}
+          poetry install --all-extras
+
+      - name: Run Pytest on Extras
+        run: poetry run poe test
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -141,9 +141,15 @@ This is the first major version release tested to work on Indico 6.X.

 * Small but important fix to add original filename to the workflow result object

-
 ## 6.1.0 5/6/24

 ### Removed

 * Removed staggered loop support and removed highlighting support.
+
+## 6.14.0 3/10/25
+
+* Added `results` module.
+* Added `etloutput` module.
+* Refactored `retry` decorator with asyncio support.
+* Switched to Poetry for packaging and dependency management.
137 changes: 81 additions & 56 deletions README.md
@@ -1,84 +1,109 @@
-# Indico-Toolkit
+# Indico Toolkit

-A library to assist Indico IPA development
+**This repository contains software that is not officially supported by Indico. It may
+be outdated or contain bugs. The operations it performs are potentially destructive.
+Use at your own risk.**

-### Available Functionality
+Classes, functions, and abstractions for building workflows using the Indico IPA
+(Intelligent Process Automation) platform.

-The indico-toolkit provides classes and functions to help achieve the following:
+- [Polling Classes](https://github.com/IndicoDataSolutions/indico-toolkit-python/tree/main/indico_toolkit/polling/__init__.py)
+  that implement best-practices polling behavior for Auto Review and Downstream
+  processes. Easily plug in business logic without the boilerplate.
+- [Result File](https://github.com/IndicoDataSolutions/indico-toolkit-python/blob/main/indico_toolkit/results/__init__.py)
+  and [Etl Output](https://github.com/IndicoDataSolutions/indico-toolkit-python/blob/main/indico_toolkit/etloutput/__init__.py)
+  Data Classes that parse standard IPA JSON output into idiomatic, type-safe Python dataclasses.
+- [Metrics Classes](https://github.com/IndicoDataSolutions/indico-toolkit-python/blob/main/indico_toolkit/metrics/__init__.py)
+  to compare model performance, evaluate ground truth, and plot statistics.
+- [Snapshot Classes](https://github.com/IndicoDataSolutions/indico-toolkit-python/blob/main/indico_toolkit/snapshots/snapshot.py)
+  to concatenate, merge, filter, and manipulate snapshot CSVs.

-* Easy batch workflow submission and retrieval.
-* Classes that simplify dataset/doc-extraction functionality.
-* Tools to assist with positioning, e.g. row association, distance between preds, relative position validation.
-* Tools to assist with creating and copying workflow structures.
-* Get metrics for all model IDs in a model group to see how well fields are performing after more labeling.
-* Compare two models via bar plot and data tables.
-* Train a document classification model without labeling.
-* An AutoReview class to assist with automated acceptance/rejection of model predictions.
-* Common manipulation of prediction/workflow results.
-* Objects to simplify parsing OCR responses.
-* Snapshot merging and manipulation
+...and more in the [Examples](https://github.com/IndicoDataSolutions/indico-toolkit-python/tree/main/examples) folder.

-### Installation
+## Installation
+
+**Indico Toolkit does not use semantic versioning.**
+
+Indico Toolkit versions match the minimum IPA version required to use its functionality.
+E.g. `indico-toolkit==6.14.0` makes use of functionality introduced in IPA 6.14, and
+some functionality requires IPA 6.14 or later to use.

+```bash
+pip install indico-toolkit
 ```
-pip install indico_toolkit
+
+Some functionality requires optional dependencies that can be installed with extras.
+
+```bash
+pip install 'indico-toolkit[all]'
+pip install 'indico-toolkit[downloads]'
+pip install 'indico-toolkit[examples]'
+pip install 'indico-toolkit[metrics]'
+pip install 'indico-toolkit[predictions]'
+pip install 'indico-toolkit[snapshots]'
 ```

-* Note: if you are on Indico 6.X, install an indico_toolkit 6.X version. If you're on 5.X install a 2.X version.
-* Note: If you are on a version of the Indico IPA platform pre-5.1, then install indico-toolkit==1.2.3.

-### Example Useage
+## Contributing

-For scripted examples on how to use the toolkit, see the [examples directory](https://github.com/IndicoDataSolutions/Indico-Solutions-Toolkit/tree/main/examples)
+Indico Toolkit uses Poetry 2.X for package and dependency management.

-### Tests
+### Setup

-To run the test suite you will need to set the following environment variables: HOST_URL, API_TOKEN_PATH.
-You can also set WORKFLOW_ID (workflow w/ single extraction model), MODEL_NAME (extraction model name)
-and DATASET_ID (uploaded dataset). If you don't set these 3 env variables, test configuration will
-upload a dataset and create a workflow.
+Clone the source repository with Git.

-```
-pytest
+```bash
+git clone [email protected]:IndicoDataSolutions/indico-toolkit-python.git
 ```

-### Example
+Install dependencies with Poetry.

-How to get prediction results and write the results to CSV
+```bash
+poetry install
 ```

+Formatting, linting, type checking, and tests are defined as
+[Poe](https://poethepoet.natn.io/) tasks in `pyproject.toml`.
+
+```bash
+poetry run poe {format,check,test,all}
 ```
-from indico_toolkit.indico_wrapper import Workflow
-from indico_toolkit.pipelines import FileProcessing
-from indico_toolkit import create_client

-WORKFLOW_ID = 1418
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
+Code changes or additions should pass `poetry run poe all` before opening a PR.

+### Tests
+
+Indico Toolkit has three test suites: required unit tests, extra unit tests, and
+integration tests.

-# Instantiate the workflow class
-client = create_client(HOST, API_TOKEN_PATH)
-wflow = Workflow(client)
+By default, only required unit tests are executed. Extra unit tests and integration
+tests are skipped.

-# Collect files to submit
-fp = FileProcessing()
-fp.get_file_paths_from_dir("./datasets/disclosures/")
+```bash
+poetry run poe {test,all}
+```

-# Submit documents, await the results and write the results to CSV in batches of 10
-for paths in fp.batch_files(batch_size=10):
-    submission_ids = wflow.submit_documents_to_workflow(WORKFLOW_ID, paths)
-    submission_results = wflow.get_submission_results_from_ids(submission_ids)
-    for filename, result in zip(paths, submission_results):
-        result.predictions.to_csv("./results.csv", filename=filename, append_if_exists=True)
+Extra unit tests are skipped when their dependencies are not installed. To execute extra
+unit tests, install one or more extras and run the tests.

+```bash
+poetry install --all-extras
+poetry run poe {test,all}
+```

-### Contributing
+Integration tests make API calls to an IPA environment and require a host and API token
+to execute. These tests create datasets, setup workflows, and train models. **Expect
+them to take tens of minutes to run.**

-If you are adding new features to Indico Toolkit, make sure to:
+```bash
+poetry run poe test-integration \
+  --host try.indico.io \
+  --token indico_api_token.txt
+```

-* Add robust integration and unit tests.
-* Add a sample usage script to the 'examples/' directory.
-* Add a bullet point for what the feature does to the list at the top of this README.md.
-* Ensure the full test suite is passing locally before creating a pull request.
-* Add doc strings for methods where usage is non-obvious.
-* If you are using new pip installed libraries, make sure they are added to the setup.py and pyproject.toml.
+Make liberal use of pytest's `--last-failed` and `--failed-first`
+[flags](https://docs.pytest.org/en/stable/how-to/cache.html) to speed up integration
+test execution when writing code.
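Several commits in this PR make `tqdm`, `plotly`, and `pandas` optional dependencies and add an `all` extras group. As a rough sketch of how such extras groups can be declared under PEP 621 (the group contents below are illustrative guesses, not the repository's actual `pyproject.toml`):

```toml
# Hypothetical sketch -- actual group names and contents live in the
# repository's pyproject.toml and may differ.
[project.optional-dependencies]
downloads = ["tqdm"]
metrics = ["plotly", "pandas"]
snapshots = ["pandas"]
all = ["tqdm", "plotly", "pandas"]
```

With groups like these, `pip install 'indico-toolkit[metrics]'` pulls in only the plotting dependencies, while `[all]` aggregates every optional dependency.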
21 changes: 11 additions & 10 deletions examples/auto_review_predictions.py
@@ -1,17 +1,14 @@
 """
 Submit documents to a workflow, auto review them and submit them for human review
 """
-from indico_toolkit.auto_review import (
-    AutoReviewFunction,
-    AutoReviewer,
-)

+from indico_toolkit import create_client
+from indico_toolkit.auto_review import AutoReviewer, AutoReviewFunction
 from indico_toolkit.auto_review.auto_review_functions import (
+    accept_by_confidence,
     remove_by_confidence,
-    accept_by_confidence
 )
 from indico_toolkit.indico_wrapper import Workflow
-from indico_toolkit import create_client
-

 WORKFLOW_ID = 1234
 HOST = "app.indico.io"
@@ -29,6 +26,7 @@
 wf_results = wflow.get_submission_results_from_ids(submission_ids)
 predictions = wf_results[0].predictions.to_list()

+
 # Set up custom review function
 def custom_function(predictions, labels: list = None, match_text: str = ""):
     for pred in predictions:
@@ -39,9 +37,13 @@ def custom_function(predictions, labels: list = None, match_text: str = ""):

 # Set up review functions and review predictions
 functions = [
-    AutoReviewFunction(remove_by_confidence, kwargs={"conf_threshold": 0.90}), # will default to all labels if labels is not provided
+    AutoReviewFunction(
+        remove_by_confidence, kwargs={"conf_threshold": 0.90}
+    ),  # will default to all labels if labels is not provided
     AutoReviewFunction(accept_by_confidence, labels=["Name", "Amount"]),
-    AutoReviewFunction(custom_function, kwargs={"match_text": "text to match"}) # call custom auto review function
+    AutoReviewFunction(
+        custom_function, kwargs={"match_text": "text to match"}
+    ),  # call custom auto review function
 ]
 auto_reviewer = AutoReviewer(predictions, functions)
 auto_reviewer.apply_reviews()
@@ -50,4 +52,3 @@ def custom_function(predictions, labels: list = None, match_text: str = ""):
 wflow.submit_submission_review(
     submission_ids[0], {MODEL_NAME: auto_reviewer.updated_predictions}
 )
-
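The diff above configures `AutoReviewFunction` with `remove_by_confidence` and a `conf_threshold` keyword. As a generic sketch of what confidence-threshold review logic looks like (the prediction schema and field names below are assumptions, not the toolkit's actual implementation):

```python
# Generic sketch of confidence-threshold auto review, similar in spirit to
# the toolkit's `remove_by_confidence`. The prediction dict schema here
# (label/confidence/rejected keys) is hypothetical.

def remove_low_confidence(predictions, conf_threshold=0.90, labels=None):
    """Flag low-confidence predictions for removal during auto review."""
    for pred in predictions:
        if labels is not None and pred["label"] not in labels:
            continue  # only review the requested labels
        if pred["confidence"][pred["label"]] < conf_threshold:
            pred["rejected"] = True  # mark for removal in human review
    return predictions
```

Pairing such a function with an accept-by-confidence counterpart, as the example script does, lets high-confidence predictions skip human review while low-confidence ones are surfaced for correction.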
2 changes: 1 addition & 1 deletion examples/copy_teach_task.py
@@ -16,5 +16,5 @@
 new_workflow = auto_populator.copy_teach_task(
     dataset_id=DATASET_ID,
     teach_task_id=TEACH_TASK_ID,
-    workflow_name=f"Copied Workflow",
+    workflow_name="Copied Workflow",
 )
7 changes: 3 additions & 4 deletions examples/create_auto_classification_workflow.py
@@ -2,9 +2,8 @@
 from indico_toolkit.auto_populate import AutoPopulator

 """
-Create an Indico Classification Workflow without any labeling
-using an organized directory/folder structure. Each folder/directory should contain only one file
-type.
+Create an Indico Classification Workflow without any labeling using an organized
+directory/folder structure. Each folder/directory should contain only one file type.

 For example, you would target '/base_directory/' if you had your files organized like:
@@ -25,4 +24,4 @@
     "My dataset",
     "My workflow",
     "My teach task",
-)
\ No newline at end of file
+)
2 changes: 1 addition & 1 deletion examples/create_full_structure.py
@@ -24,7 +24,7 @@
     files_to_upload=["./path_to_file"],
     read_api=True,
     single_column=False,
-    **optional_ocr_options
+    **optional_ocr_options,
 )

 # creates workflow
3 changes: 1 addition & 2 deletions examples/dataset_tasks.py
@@ -1,6 +1,6 @@
+from indico_toolkit import create_client
 from indico_toolkit.indico_wrapper import Datasets, Download
 from indico_toolkit.pipelines import FileProcessing
-from indico_toolkit import create_client

 DATASET_ID = 1234
 HOST = "app.indico.io"
@@ -23,4 +23,3 @@
 for paths in fp.batch_files(batch_size=2):
     datasets.add_files_to_dataset(paths)
     print(f"Uploaded {len(paths)} files")
-