From 69fe564dcd5633f1d5a9f158a5fd86790ee809be Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 13:57:46 +0530 Subject: [PATCH 1/8] build: add markitdown-dicom package boilerplate and entry points --- packages/markitdown-dicom/.gitignore | 1 + packages/markitdown-dicom/pyproject.toml | 68 +++++++++++++++++++ .../src/markitdown_dicom/__about__.py | 4 ++ .../src/markitdown_dicom/__init__.py | 14 ++++ .../src/markitdown_dicom/_plugin.py | 11 +++ .../src/markitdown_dicom/py.typed | 1 + 6 files changed, 99 insertions(+) create mode 100644 packages/markitdown-dicom/.gitignore create mode 100644 packages/markitdown-dicom/pyproject.toml create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/__about__.py create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/__init__.py create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/_plugin.py create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/py.typed diff --git a/packages/markitdown-dicom/.gitignore b/packages/markitdown-dicom/.gitignore new file mode 100644 index 000000000..571830800 --- /dev/null +++ b/packages/markitdown-dicom/.gitignore @@ -0,0 +1 @@ +tests/test-dicom-files/ diff --git a/packages/markitdown-dicom/pyproject.toml b/packages/markitdown-dicom/pyproject.toml new file mode 100644 index 000000000..8c2687539 --- /dev/null +++ b/packages/markitdown-dicom/pyproject.toml @@ -0,0 +1,68 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "markitdown-dicom" +dynamic = ["version"] +description = 'DICOM converter plugin for MarkItDown - Extracts metadata from .dcm files' +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +keywords = ["markitdown", "dicom", "metadata", "pydicom"] +authors = [ + { name = "Aryan Kaushik", email = "aryankaushik251@gmail.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + 
"Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", +] +dependencies = [ + "markitdown>=0.1.0a1", + "pydicom>=2.4.0", +] + +[project.urls] +Documentation = "https://github.com/microsoft/markitdown#readme" +Issues = "https://github.com/microsoft/markitdown/issues" +Source = "https://github.com/microsoft/markitdown" + +[tool.hatch.version] +path = "src/markitdown_dicom/__about__.py" + +[project.entry-points."markitdown.plugin"] +dicom = "markitdown_dicom" + +[tool.hatch.envs.types] +extra-dependencies = [ + "mypy>=1.0.0", +] +[tool.hatch.envs.types.scripts] +check = "mypy --install-types --non-interactive {args:src/markitdown_dicom tests}" + +[tool.coverage.run] +source_pkgs = ["markitdown_dicom", "tests"] +branch = true +parallel = true +omit = [ + "src/markitdown_dicom/__about__.py", +] + +[tool.coverage.paths] +markitdown-dicom = ["src/markitdown_dicom", "*/markitdown-dicom/src/markitdown_dicom"] +tests = ["tests", "*/markitdown-dicom/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.hatch.build.targets.sdist] +only-include = ["src/markitdown_dicom"] diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__about__.py b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py new file mode 100644 index 000000000..24f8ff955 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT +__version__ = "0.1.0a1" diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__init__.py b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py new file mode 100644 index 000000000..1da335c88 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 
2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +from ._plugin import __plugin_interface_version__, register_converters +from ._dicom_converter import DicomConverter +from .__about__ import __version__ + +__all__ = [ + "__version__", + "__plugin_interface_version__", + "register_converters", + "DicomConverter", +] diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py b/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py new file mode 100644 index 000000000..4106b49db --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py @@ -0,0 +1,11 @@ +from typing import Any +from markitdown import MarkItDown +from ._dicom_converter import DicomConverter + +__plugin_interface_version__ = 1 + +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + """ + Called during construction of MarkItDown instances to register converters provided by plugins. + """ + markitdown.register_converter(DicomConverter(**kwargs)) diff --git a/packages/markitdown-dicom/src/markitdown_dicom/py.typed b/packages/markitdown-dicom/src/markitdown_dicom/py.typed new file mode 100644 index 000000000..7632ecf77 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 From 61be04341c7266389925c03a3469df7a100a1690 Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 13:58:03 +0530 Subject: [PATCH 2/8] feat: implement DicomConverter with accepts and metadata formatting --- .../src/markitdown_dicom/_dicom_converter.py | 300 ++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py new file mode 100644 index 000000000..35cc2c22b --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py @@ -0,0 +1,300 @@ +# 
SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +import re +import sys +from typing import Any, BinaryIO, Dict, List, Optional + +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo, MissingDependencyException + +# Lazy loading of pydicom to raise MissingDependencyException during conversion if not installed. +_dependency_exc_info = None +try: + import pydicom +except ImportError: + _dependency_exc_info = sys.exc_info() + + +class DicomConverter(DocumentConverter): + """ + Converts DICOM (.dcm) files to structured, token-efficient Markdown. + Extracts key Study, Series, Acquisition, Equipment, and Image characteristics. + Omits and redacts Patient PII (Name, ID, Birth Date) by default. + """ + + def __init__(self, redact_pii: bool = True, **kwargs: Any): + super().__init__() + self._redact_pii = redact_pii + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + # Check standard extension / MIME type + if extension in (".dcm", ".dicom") or mimetype == "application/dicom": + return True + + # Peek at stream to check signature 'DICM' at offset 128 + cur_pos = file_stream.tell() + try: + file_stream.seek(128) + sig = file_stream.read(4) + if sig == b"DICM": + return True + except Exception: + pass + finally: + file_stream.seek(cur_pos) + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + # Check if pydicom is available + if _dependency_exc_info is not None: + raise MissingDependencyException( + "markitdown-dicom requires pydicom to be installed. 
" + "To resolve, run: pip install pydicom" + ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) # type: ignore + + # Resolve redact_pii setting (defaulting to True) + redact_pii = kwargs.get("redact_pii", self._redact_pii) + + # Parse DICOM from the stream. + # Use defer_size="1 KB" so we don't load large pixel data arrays into memory. + # force=True allows parsing datasets without file meta header. + try: + ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True) + if ds is None or len(ds) == 0: + raise ValueError("Parsed dataset has no elements.") + except Exception as e: + raise ValueError(f"Failed to parse DICOM file: {e}") from e + + # Extracted elements + lines = ["# DICOM File", ""] + + # Date and Time Formatter helpers + def _format_date(val: Any) -> Optional[str]: + if not val: + return None + val_str = str(val).strip() + if len(val_str) == 8 and val_str.isdigit(): + return f"{val_str[0:4]}-{val_str[4:6]}-{val_str[6:8]}" + return val_str + + def _format_time(val: Any) -> Optional[str]: + if not val: + return None + val_str = str(val).strip() + if "." in val_str: + time_part, frac_part = val_str.split(".", 1) + else: + time_part, frac_part = val_str, "" + + if len(time_part) >= 6 and time_part.isdigit(): + formatted = f"{time_part[0:2]}:{time_part[2:4]}:{time_part[4:6]}" + if frac_part: + formatted += f".{frac_part}" + return formatted + elif len(time_part) >= 4 and time_part.isdigit(): + formatted = f"{time_part[0:2]}:{time_part[2:4]}" + if frac_part: + formatted += f".{frac_part}" + return formatted + return val_str + + def _get_val(keyword: str) -> Any: + val = getattr(ds, keyword, None) + if val is None: + return None + if isinstance(val, (list, tuple)) or type(val).__name__ == "MultiValue": + return ", ".join(str(x) for x in val) + return val + + # Define category structures + # 1. 
Patient Information + p_name = _get_val("PatientName") + p_id = _get_val("PatientID") + p_dob = _get_val("PatientBirthDate") + + if redact_pii: + p_name = "[REDACTED]" if p_name is not None else None + p_id = "[REDACTED]" if p_id is not None else None + p_dob = "[REDACTED]" if p_dob is not None else None + else: + if p_name: + p_name = str(p_name).replace("^", " ").strip() + + patient_fields = { + "Patient Name": p_name, + "Patient ID": p_id, + "Patient Birth Date": _format_date(p_dob), + "Patient Sex": _get_val("PatientSex"), + "Patient Age": _get_val("PatientAge"), + } + + # 2. Study Information + study_fields = { + "Study Instance UID": _get_val("StudyInstanceUID"), + "Study Date": _format_date(_get_val("StudyDate")), + "Study Time": _format_time(_get_val("StudyTime")), + "Study Description": _get_val("StudyDescription"), + "Accession Number": _get_val("AccessionNumber"), + } + + # 3. Series Information + series_fields = { + "Series Instance UID": _get_val("SeriesInstanceUID"), + "Series Number": _get_val("SeriesNumber"), + "Series Description": _get_val("SeriesDescription"), + } + + # 4. Acquisition Information + acquisition_fields = { + "Modality": _get_val("Modality"), + "Protocol Name": _get_val("ProtocolName"), + "Exposure": _get_val("Exposure"), + "Exposure Time": _get_val("ExposureTime"), + "KVP": _get_val("KVP"), + "Acquisition Date": _format_date(_get_val("AcquisitionDate")), + "Acquisition Time": _format_time(_get_val("AcquisitionTime")), + } + + # 5. Equipment Information + equipment_fields = { + "Manufacturer": _get_val("Manufacturer"), + "Manufacturer Model Name": _get_val("ManufacturerModelName"), + "Device Serial Number": _get_val("DeviceSerialNumber"), + "Software Versions": _get_val("SoftwareVersions"), + } + + # 6. 
Image Characteristics + rows = _get_val("Rows") + cols = _get_val("Columns") + resolution = f"{rows} × {cols}" if rows and cols else None + + pixel_data_present = "Yes" if (0x7FE0, 0x0010) in ds else "No" + + image_fields = { + "Resolution": resolution, + "Samples Per Pixel": _get_val("SamplesPerPixel"), + "Bits Allocated": _get_val("BitsAllocated"), + "Bits Stored": _get_val("BitsStored"), + "High Bit": _get_val("HighBit"), + "Pixel Representation": _get_val("PixelRepresentation"), + "Photometric Interpretation": _get_val("PhotometricInterpretation"), + "Frame Count": _get_val("NumberOfFrames"), + "Pixel Data Present": pixel_data_present, + } + + # 7. Other Useful Text Fields + other_fields = { + "Image Comments": _get_val("ImageComments"), + "Institution Name": _get_val("InstitutionName"), + "Station Name": _get_val("StationName"), + "Body Part Examined": _get_val("BodyPartExamined"), + } + + # Helper to render sections + def _render_section(title: str, fields: Dict[str, Any]) -> List[str]: + active = {k: v for k, v in fields.items() if v is not None and str(v).strip() != ""} + if not active: + return [] + sec_lines = [f"## {title}", ""] + for k, v in active.items(): + sec_lines.append(f"* **{k}**: {v}") + sec_lines.append("") + return sec_lines + + # Predefined sections + lines.extend(_render_section("Patient Information", patient_fields)) + lines.extend(_render_section("Study Information", study_fields)) + lines.extend(_render_section("Series Information", series_fields)) + lines.extend(_render_section("Acquisition Parameters", acquisition_fields)) + lines.extend(_render_section("Equipment", equipment_fields)) + lines.extend(_render_section("Image Properties", image_fields)) + lines.extend(_render_section("Other Information", other_fields)) + + # 8. 
Private / Custom textual tags when reasonable + EXCLUDED_KEYWORDS = { + # Study + "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber", + # Series + "SeriesInstanceUID", "SeriesNumber", "SeriesDescription", + # Acquisition + "Modality", "ProtocolName", "Exposure", "ExposureTime", "KVP", "AcquisitionDate", "AcquisitionTime", + # Equipment + "Manufacturer", "ManufacturerModelName", "DeviceSerialNumber", "SoftwareVersions", + # Image Characteristics + "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames", + # Other Useful Text Fields + "ImageComments", "InstitutionName", "StationName", "BodyPartExamined", + # Patient info + "PatientName", "PatientID", "PatientBirthDate", "PatientSex", "PatientAge" + } + EXCLUDED_VRS = {"OB", "OW", "OF", "OD", "SQ", "UN"} + + custom_fields: Dict[str, str] = {} + for elem in ds: + # Skip file meta or pixel group + if elem.tag.group in (0x0002, 0x7FE0) or elem.tag.element == 0: + continue + + # Skip binary, sequence, or unknown VRs + if elem.VR in EXCLUDED_VRS: + continue + + keyword = elem.keyword + if not keyword: + if elem.tag.is_private: + label = f"Private Tag ({elem.tag.group:04X},{elem.tag.element:04X})" + else: + label = f"Tag ({elem.tag.group:04X},{elem.tag.element:04X})" + else: + if keyword in EXCLUDED_KEYWORDS: + continue + # Split CamelCase to separate words + label = re.sub(r'(? 
Date: Fri, 12 Jun 2026 13:58:10 +0530 Subject: [PATCH 3/8] test: add deterministic in-memory unit tests for DicomConverter --- packages/markitdown-dicom/tests/__init__.py | 3 + .../tests/test_dicom_converter.py | 240 ++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 packages/markitdown-dicom/tests/__init__.py create mode 100644 packages/markitdown-dicom/tests/test_dicom_converter.py diff --git a/packages/markitdown-dicom/tests/__init__.py b/packages/markitdown-dicom/tests/__init__.py new file mode 100644 index 000000000..aa8931747 --- /dev/null +++ b/packages/markitdown-dicom/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py new file mode 100644 index 000000000..51f1fba33 --- /dev/null +++ b/packages/markitdown-dicom/tests/test_dicom_converter.py @@ -0,0 +1,240 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +import io +import pytest +from typing import Dict, Any, Optional + +import pydicom +from pydicom.dataset import FileDataset, FileMetaDataset +from pydicom.uid import ExplicitVRLittleEndian, generate_uid + +from markitdown import MarkItDown, StreamInfo +from markitdown_dicom import DicomConverter + + +def create_mock_dicom( + patient_name: Optional[str] = "Test^Patient", + patient_id: Optional[str] = "123456", + patient_dob: Optional[str] = "19800101", + modality: str = "CT", + study_description: str = "Mock Study", + rows: Optional[int] = 512, + cols: Optional[int] = 512, + has_pixel_data: bool = True, + extra_fields: Optional[Dict[str, Any]] = None, +) -> io.BytesIO: + """Helper to programmatically generate a valid DICOM file in memory.""" + file_meta = FileMetaDataset() + file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT Image Storage + file_meta.MediaStorageSOPInstanceUID = 
generate_uid() + file_meta.TransferSyntaxUID = ExplicitVRLittleEndian + file_meta.ImplementationClassUID = pydicom.uid.PYDICOM_IMPLEMENTATION_UID + + ds = FileDataset("in_memory.dcm", {}, file_meta=file_meta, preamble=b"\0" * 128) + + if patient_name is not None: + ds.PatientName = patient_name + if patient_id is not None: + ds.PatientID = patient_id + if patient_dob is not None: + ds.PatientBirthDate = patient_dob + + ds.PatientSex = "M" + ds.PatientAge = "045Y" + ds.Modality = modality + ds.StudyInstanceUID = generate_uid() + ds.SeriesInstanceUID = generate_uid() + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + ds.StudyDate = "20260612" + ds.StudyTime = "120000.123" + ds.StudyDescription = study_description + ds.AccessionNumber = "ACC-12345" + ds.SeriesNumber = 1 + ds.SeriesDescription = "PA View" + ds.Manufacturer = "GE Medical Systems" + + if rows is not None: + ds.Rows = rows + if cols is not None: + ds.Columns = cols + + ds.SamplesPerPixel = 1 + ds.BitsAllocated = 16 + ds.BitsStored = 16 + ds.HighBit = 15 + ds.PixelRepresentation = 0 + ds.PhotometricInterpretation = "MONOCHROME2" + + if has_pixel_data: + # Use simple dummy bytes for pixel data + ds.PixelData = b"\x00" * 100 + + if extra_fields: + for keyword, val in extra_fields.items(): + setattr(ds, keyword, val) + + buffer = io.BytesIO() + ds.save_as(buffer, enforce_file_format=False) + buffer.seek(0) + return buffer + + +def test_dicom_converter_accepts() -> None: + """Verifies that the DicomConverter accepts DICOM streams using metadata or signature checks.""" + converter = DicomConverter() + + # Case 1: Acceptance by extension + assert converter.accepts( + io.BytesIO(b""), + StreamInfo(extension=".dcm"), + ) + assert converter.accepts( + io.BytesIO(b""), + StreamInfo(extension=".dicom"), + ) + + # Case 2: Acceptance by MIME type + assert converter.accepts( + io.BytesIO(b""), + StreamInfo(mimetype="application/dicom"), + ) + + # Case 3: 
Acceptance by peeking at DICM signature at offset 128 + mock_dicom = create_mock_dicom() + assert converter.accepts( + mock_dicom, + StreamInfo(extension=".raw"), # Wrong extension, but valid stream + ) + + # Case 4: Rejection of non-DICOM content + assert not converter.accepts( + io.BytesIO(b"\x00" * 200), + StreamInfo(extension=".txt"), + ) + + +def test_dicom_converter_default_redaction() -> None: + """Tests that by default, patient identifying details are redacted but clinical demographics are kept.""" + converter = DicomConverter() + stream = create_mock_dicom( + patient_name="Doe^John", + patient_id="PID-999", + patient_dob="19750505", + ) + + result = converter.convert(stream, StreamInfo()) + + # PatientName, PatientID, and PatientBirthDate must be redacted + assert "Doe John" not in result.markdown + assert "PID-999" not in result.markdown + assert "1975-05-05" not in result.markdown + assert "Patient Name**: [REDACTED]" in result.markdown + assert "Patient ID**: [REDACTED]" in result.markdown + assert "Patient Birth Date**: [REDACTED]" in result.markdown + + # Patient Sex and Age should remain as clinical metadata + assert "Patient Sex**: M" in result.markdown + assert "Patient Age**: 045Y" in result.markdown + + # Verifying other standard sections are rendered properly + assert "Study Description**: Mock Study" in result.markdown + assert "Resolution**: 512 × 512" in result.markdown + assert "Study Date**: 2026-06-12" in result.markdown + assert "Study Time**: 12:00:00.123" in result.markdown + + +def test_dicom_converter_disabled_redaction() -> None: + """Tests that when redact_pii is set to False, identifiers are extracted normally.""" + converter = DicomConverter(redact_pii=False) + stream = create_mock_dicom( + patient_name="Doe^John", + patient_id="PID-999", + patient_dob="19750505", + ) + + result = converter.convert(stream, StreamInfo()) + + assert "Patient Name**: Doe John" in result.markdown + assert "Patient ID**: PID-999" in result.markdown + 
assert "Patient Birth Date**: 1975-05-05" in result.markdown + + +def test_dicom_converter_missing_fields() -> None: + """Verifies that missing optional tags do not raise exceptions and are simply omitted.""" + converter = DicomConverter() + # Create DICOM file with no manufacturer, resolution, or description + stream = create_mock_dicom( + study_description="", + rows=None, + cols=None, + ) + + result = converter.convert(stream, StreamInfo()) + + # Ensure no empty field or error occurs + assert "Resolution" not in result.markdown + assert "Study Description" not in result.markdown + assert "Manufacturer**" in result.markdown # Manufacturer remains since it wasn't set to None + assert "DICOM File" in result.markdown + + +def test_dicom_converter_custom_and_private_tags() -> None: + """Verifies that extra textual/numeric tags and private tags are formatted correctly.""" + converter = DicomConverter() + + # Add custom standard tags (e.g. BodyPartExamined, InstitutionName) and a private tag + # Private tags use odd group numbers, e.g., 0x0009 + extra_fields = { + "InstitutionName": "Central Hospital", + "BodyPartExamined": "CHEST", + "InstitutionAddress": "123 Clinic Rd", + } + stream = create_mock_dicom(extra_fields=extra_fields) + + # Let's add a raw private tag directly to the dataset + ds = pydicom.dcmread(stream, force=True) + # Register private creator block + ds.private_block(0x0009, "Mock Creator", create=True) + # Add a private element in group 0x0009 + ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value") + + # Save modified dataset to a new stream + new_stream = io.BytesIO() + ds.save_as(new_stream) + new_stream.seek(0) + + result = converter.convert(new_stream, StreamInfo()) + + # Verify standard custom fields + assert "Institution Name**: Central Hospital" in result.markdown + assert "Body Part Examined**: CHEST" in result.markdown + + # Verify additional standard fields split camelcase + assert "Institution Address**: 
123 Clinic Rd" in result.markdown + + # Verify private tag rendering + assert "Private Tag (0009,1001)**: Mock Private Value" in result.markdown + + +def test_markitdown_plugin_integration() -> None: + """Tests that MarkItDown loads and uses the DicomConverter when enable_plugins is True.""" + md = MarkItDown(enable_plugins=True) + stream = create_mock_dicom(study_description="Integration Test") + + # Convert using the file stream with hint + result = md.convert(stream, stream_info=StreamInfo(extension=".dcm")) + + assert "Study Description**: Integration Test" in result.markdown + assert "Patient Name**: [REDACTED]" in result.markdown + + +def test_corrupted_dicom() -> None: + """Verifies that a corrupted DICOM stream raises ValueError during conversion.""" + converter = DicomConverter() + corrupt_stream = io.BytesIO(b"DICM" + b"\xff" * 100) + + with pytest.raises(ValueError, match="Failed to parse DICOM file"): + converter.convert(corrupt_stream, StreamInfo()) From 9a0dd91bffb43feb0610c3a1804ba21341ec4050 Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 13:58:17 +0530 Subject: [PATCH 4/8] docs: add installation and usage guide in markitdown-dicom README --- packages/markitdown-dicom/README.md | 112 ++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 packages/markitdown-dicom/README.md diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md new file mode 100644 index 000000000..a30bc8d6b --- /dev/null +++ b/packages/markitdown-dicom/README.md @@ -0,0 +1,112 @@ +# MarkItDown DICOM Plugin (`markitdown-dicom`) + +This is a plugin for [MarkItDown](https://github.com/microsoft/markitdown) that adds support for converting DICOM (`.dcm`) files into LLM-friendly Markdown metadata representations. + +The plugin is designed to be highly memory-efficient (using deferred loading for pixel data) and token-efficient, ignoring raw pixel arrays while extracting clinically-relevant metadata. 
+ +## Features + +- **Efficient Stream Peeking**: Fast detection of DICOM files, even without a `.dcm` extension, by checking for the `DICM` magic bytes at offset 128 (immediately after the 128-byte preamble). +- **Memory Safety**: Uses `pydicom` with deferred value loading (`defer_size="1 KB"`) to parse headers of large multi-frame DICOM files without loading gigabytes of pixel data. +- **PII Redaction by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date. Other potentially identifying fields (e.g. UIDs, accession number, dates, institution name) are still emitted, so this alone does not constitute full HIPAA de-identification. +- **Formatted Metadata**: Standardizes dates to `YYYY-MM-DD` and times to `HH:MM:SS` for downstream RAG and vector database ingestion. +- **Custom Tag Support**: Automatically extracts and formats additional standard and private vendor tags if they are simple numbers/text. + +## Installation + +Install the plugin along with MarkItDown: + +```bash +pip install markitdown-dicom +``` + +## Usage + +### Command Line Interface + +Use the `-p` (or `--use-plugins`) option to enable third-party plugins: + +```bash +markitdown --use-plugins patient_scan.dcm -o patient_scan.md +``` + +### Python API + +```python +from markitdown import MarkItDown + +# Initialize MarkItDown with plugins enabled +md = MarkItDown(enable_plugins=True) + +# Convert a DICOM file +result = md.convert("patient_scan.dcm") +print(result.text_content) +``` + +### Disabling PII Redaction + +If you are working in a fully de-identified or secure clinical environment and want to retain Patient Name, Patient ID, and Patient Birth Date, you can disable redaction: + +```python +from markitdown import MarkItDown + +md = MarkItDown(enable_plugins=True, redact_pii=False) +result = md.convert("patient_scan.dcm") +``` + +## Example Output + +```markdown +# DICOM File + +## Patient Information + +* **Patient Name**: [REDACTED] +* **Patient ID**: [REDACTED] +* **Patient Birth Date**: [REDACTED] +* **Patient Sex**: M +* **Patient Age**: 045Y + +## Study Information + +* **Study Instance UID**: 1.2.840.113619.2.134.1.20230612.98765432 +* **Study Date**: 2023-06-12 +* **Study Time**: 11:44:27 +* **Study 
Description**: Chest X-Ray +* **Accession Number**: ACC-98765 + +## Series Information + +* **Series Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432 +* **Series Number**: 1 +* **Series Description**: PA View + +## Acquisition Parameters + +* **Modality**: DX +* **Protocol Name**: Chest PA +* **Exposure**: 2 +* **Exposure Time**: 10 +* **KVP**: 120 +* **Acquisition Date**: 2023-06-12 +* **Acquisition Time**: 11:45:00 + +## Equipment + +* **Manufacturer**: GE Medical Systems +* **Manufacturer Model Name**: Discovery +* **Device Serial Number**: SN-12345 +* **Software Versions**: v1.2.3 + +## Image Properties + +* **Resolution**: 2048 × 1500 +* **Samples Per Pixel**: 1 +* **Bits Allocated**: 16 +* **Bits Stored**: 12 +* **High Bit**: 11 +* **Pixel Representation**: 0 +* **Photometric Interpretation**: MONOCHROME2 +* **Frame Count**: 1 +* **Pixel Data Present**: Yes +``` From d195275889af2453a85d0e816cd2e60815848d61 Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 14:07:15 +0530 Subject: [PATCH 5/8] feat: ignore private vendor tags by default to prevent metadata bloat --- .../src/markitdown_dicom/_dicom_converter.py | 10 +++++-- .../tests/test_dicom_converter.py | 29 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py index 35cc2c22b..fe97bb39a 100644 --- a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py +++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py @@ -23,9 +23,10 @@ class DicomConverter(DocumentConverter): Omits and redacts Patient PII (Name, ID, Birth Date) by default. 
""" - def __init__(self, redact_pii: bool = True, **kwargs: Any): + def __init__(self, redact_pii: bool = True, include_private_tags: bool = False, **kwargs: Any): super().__init__() self._redact_pii = redact_pii + self._include_private_tags = include_private_tags def accepts( self, @@ -67,8 +68,9 @@ def convert( "To resolve, run: pip install pydicom" ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) # type: ignore - # Resolve redact_pii setting (defaulting to True) + # Resolve settings redact_pii = kwargs.get("redact_pii", self._redact_pii) + include_private_tags = kwargs.get("include_private_tags", self._include_private_tags) # Parse DICOM from the stream. # Use defer_size="1 KB" so we don't load large pixel data arrays into memory. @@ -246,6 +248,10 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]: custom_fields: Dict[str, str] = {} for elem in ds: + # Skip private tags unless explicitly requested + if elem.tag.is_private and not include_private_tags: + continue + # Skip file meta or pixel group if elem.tag.group in (0x0002, 0x7FE0) or elem.tag.element == 0: continue diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py index 51f1fba33..138d832ac 100644 --- a/packages/markitdown-dicom/tests/test_dicom_converter.py +++ b/packages/markitdown-dicom/tests/test_dicom_converter.py @@ -183,7 +183,7 @@ def test_dicom_converter_missing_fields() -> None: def test_dicom_converter_custom_and_private_tags() -> None: """Verifies that extra textual/numeric tags and private tags are formatted correctly.""" - converter = DicomConverter() + converter = DicomConverter(include_private_tags=True) # Add custom standard tags (e.g. 
BodyPartExamined, InstitutionName) and a private tag # Private tags use odd group numbers, e.g., 0x0009 @@ -219,6 +219,33 @@ def test_dicom_converter_custom_and_private_tags() -> None: assert "Private Tag (0009,1001)**: Mock Private Value" in result.markdown +def test_dicom_converter_exclude_private_tags_by_default() -> None: + """Verifies that private tags are excluded by default when include_private_tags is False.""" + converter = DicomConverter() # default is False + + extra_fields = { + "InstitutionName": "Central Hospital", + } + stream = create_mock_dicom(extra_fields=extra_fields) + + ds = pydicom.dcmread(stream, force=True) + ds.private_block(0x0009, "Mock Creator", create=True) + ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value") + + new_stream = io.BytesIO() + ds.save_as(new_stream) + new_stream.seek(0) + + result = converter.convert(new_stream, StreamInfo()) + + # Standard custom fields should still be present + assert "Institution Name**: Central Hospital" in result.markdown + + # Private tags should be excluded + assert "Mock Private Value" not in result.markdown + assert "Private Tag" not in result.markdown + + def test_markitdown_plugin_integration() -> None: """Tests that MarkItDown loads and uses the DicomConverter when enable_plugins is True.""" md = MarkItDown(enable_plugins=True) From a3ab076e1bfb9fda7b689e917c2fae7e76d660d9 Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 14:18:11 +0530 Subject: [PATCH 6/8] refactor: implement strict-first parsing, robust PII checks, and expand metadata fields --- .../src/markitdown_dicom/_dicom_converter.py | 50 +++++++++++++------ .../tests/test_dicom_converter.py | 16 +++++- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py index fe97bb39a..5effe62f0 100644 --- 
a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py +++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py @@ -18,9 +18,11 @@ class DicomConverter(DocumentConverter): """ - Converts DICOM (.dcm) files to structured, token-efficient Markdown. + Converts DICOM (.dcm, .dicom) files to structured, token-efficient Markdown. Extracts key Study, Series, Acquisition, Equipment, and Image characteristics. Omits and redacts Patient PII (Name, ID, Birth Date) by default. + Supports both medical imaging and industrial radiography datasets conforming to the + DICONDE standard (ASTM E2339) used in Non-Destructive Testing (NDT). """ def __init__(self, redact_pii: bool = True, include_private_tags: bool = False, **kwargs: Any): @@ -41,7 +43,9 @@ def accepts( if extension in (".dcm", ".dicom") or mimetype == "application/dicom": return True - # Peek at stream to check signature 'DICM' at offset 128 + # Peek at stream to check signature 'DICM' at offset 128. + # This acts as a robust fallback for files lacking standard extensions (like + # industrial NDT or DICONDE images). cur_pos = file_stream.tell() try: file_stream.seek(128) @@ -74,11 +78,22 @@ def convert( # Parse DICOM from the stream. # Use defer_size="1 KB" so we don't load large pixel data arrays into memory. - # force=True allows parsing datasets without file meta header. + # We attempt a strict read first (force=False) to ensure compliance and avoid false positives + # on non-DICOM streams. If that fails (e.g. for raw datasets lacking a file meta header), + # we reset and fall back to force=True. 
+ cur_pos = file_stream.tell() try: - ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True) + ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=False) if ds is None or len(ds) == 0: raise ValueError("Parsed dataset has no elements.") + except (pydicom.errors.InvalidDicomError, TypeError): + file_stream.seek(cur_pos) + try: + ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True) + if ds is None or len(ds) == 0: + raise ValueError("Parsed dataset has no elements.") + except Exception as e: + raise ValueError(f"Failed to parse DICOM file: {e}") from e except Exception as e: raise ValueError(f"Failed to parse DICOM file: {e}") from e @@ -148,6 +163,7 @@ def _get_val(keyword: str) -> Any: # 2. Study Information study_fields = { "Study Instance UID": _get_val("StudyInstanceUID"), + "Study ID": _get_val("StudyID"), "Study Date": _format_date(_get_val("StudyDate")), "Study Time": _format_time(_get_val("StudyTime")), "Study Description": _get_val("StudyDescription"), @@ -159,6 +175,8 @@ def _get_val(keyword: str) -> Any: "Series Instance UID": _get_val("SeriesInstanceUID"), "Series Number": _get_val("SeriesNumber"), "Series Description": _get_val("SeriesDescription"), + "Series Date": _format_date(_get_val("SeriesDate")), + "Series Time": _format_time(_get_val("SeriesTime")), } # 4. Acquisition Information @@ -181,14 +199,11 @@ def _get_val(keyword: str) -> Any: } # 6. 
Image Characteristics - rows = _get_val("Rows") - cols = _get_val("Columns") - resolution = f"{rows} × {cols}" if rows and cols else None - pixel_data_present = "Yes" if (0x7FE0, 0x0010) in ds else "No" image_fields = { - "Resolution": resolution, + "Rows": _get_val("Rows"), + "Columns": _get_val("Columns"), "Samples Per Pixel": _get_val("SamplesPerPixel"), "Bits Allocated": _get_val("BitsAllocated"), "Bits Stored": _get_val("BitsStored"), @@ -196,6 +211,9 @@ def _get_val(keyword: str) -> Any: "Pixel Representation": _get_val("PixelRepresentation"), "Photometric Interpretation": _get_val("PhotometricInterpretation"), "Frame Count": _get_val("NumberOfFrames"), + "Instance Number": _get_val("InstanceNumber"), + "SOP Class UID": _get_val("SOPClassUID"), + "SOP Instance UID": _get_val("SOPInstanceUID"), "Pixel Data Present": pixel_data_present, } @@ -230,15 +248,15 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]: # 8. Private / Custom textual tags when reasonable EXCLUDED_KEYWORDS = { # Study - "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber", + "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber", "StudyID", # Series - "SeriesInstanceUID", "SeriesNumber", "SeriesDescription", + "SeriesInstanceUID", "SeriesNumber", "SeriesDescription", "SeriesDate", "SeriesTime", # Acquisition "Modality", "ProtocolName", "Exposure", "ExposureTime", "KVP", "AcquisitionDate", "AcquisitionTime", # Equipment "Manufacturer", "ManufacturerModelName", "DeviceSerialNumber", "SoftwareVersions", # Image Characteristics - "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames", + "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames", "InstanceNumber", "SOPClassUID", "SOPInstanceUID", # Other Useful Text Fields 
"ImageComments", "InstitutionName", "StationName", "BodyPartExamined", # Patient info @@ -276,10 +294,10 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]: if val is None or val == "": continue - # Check for PII tags if redaction is enabled - lower_label = label.lower() - if redact_pii and ("patient" in lower_label or "name" in lower_label or "birth" in lower_label or "id" in lower_label): - if "sex" not in lower_label and "age" not in lower_label: + # Check for Patient PII tags if redaction is enabled (standard Patient group is 0x0010) + # Retain clinical demographics: PatientSex (0x0010, 0x0040) and PatientAge (0x0010, 0x1010) + if redact_pii: + if elem.tag.group == 0x0010 and elem.tag.element not in (0x0040, 0x1010): continue # Format list value or other type diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py index 138d832ac..931fa7a1b 100644 --- a/packages/markitdown-dicom/tests/test_dicom_converter.py +++ b/packages/markitdown-dicom/tests/test_dicom_converter.py @@ -48,6 +48,10 @@ def create_mock_dicom( ds.SeriesInstanceUID = generate_uid() ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + ds.StudyID = "STUDY-1" + ds.SeriesDate = "20260612" + ds.SeriesTime = "120500" + ds.InstanceNumber = 42 ds.StudyDate = "20260612" ds.StudyTime = "120000.123" ds.StudyDescription = study_description @@ -141,9 +145,16 @@ def test_dicom_converter_default_redaction() -> None: # Verifying other standard sections are rendered properly assert "Study Description**: Mock Study" in result.markdown - assert "Resolution**: 512 × 512" in result.markdown + assert "Study ID**: STUDY-1" in result.markdown + assert "Series Date**: 2026-06-12" in result.markdown + assert "Series Time**: 12:05:00" in result.markdown + assert "Rows**: 512" in result.markdown + assert "Columns**: 512" in result.markdown + assert "Instance Number**: 42" in 
result.markdown assert "Study Date**: 2026-06-12" in result.markdown assert "Study Time**: 12:00:00.123" in result.markdown + assert "SOP Class UID**" in result.markdown + assert "SOP Instance UID**" in result.markdown def test_dicom_converter_disabled_redaction() -> None: @@ -175,7 +186,8 @@ def test_dicom_converter_missing_fields() -> None: result = converter.convert(stream, StreamInfo()) # Ensure no empty field or error occurs - assert "Resolution" not in result.markdown + assert "Rows" not in result.markdown + assert "Columns" not in result.markdown assert "Study Description" not in result.markdown assert "Manufacturer**" in result.markdown # Manufacturer remains since it wasn't set to None assert "DICOM File" in result.markdown From 54a3993b7f5601f64ca6d46c89a3538ca328650a Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 14:23:26 +0530 Subject: [PATCH 7/8] docs: update README example to match the new tags and resolution layout --- packages/markitdown-dicom/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md index a30bc8d6b..0d078b6ac 100644 --- a/packages/markitdown-dicom/README.md +++ b/packages/markitdown-dicom/README.md @@ -70,6 +70,7 @@ result = md.convert("patient_scan.dcm") ## Study Information * **Study Instance UID**: 1.2.840.113619.2.134.1.20230612.98765432 +* **Study ID**: STUDY-1 * **Study Date**: 2023-06-12 * **Study Time**: 11:44:27 * **Study Description**: Chest X-Ray @@ -80,6 +81,8 @@ result = md.convert("patient_scan.dcm") * **Series Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432 * **Series Number**: 1 * **Series Description**: PA View +* **Series Date**: 2023-06-12 +* **Series Time**: 11:45:00 ## Acquisition Parameters @@ -100,7 +103,8 @@ result = md.convert("patient_scan.dcm") ## Image Properties -* **Resolution**: 2048 × 1500 +* **Rows**: 2048 +* **Columns**: 1500 * **Samples Per Pixel**: 1 * **Bits Allocated**: 
16 * **Bits Stored**: 12 @@ -108,5 +112,8 @@ result = md.convert("patient_scan.dcm") * **Pixel Representation**: 0 * **Photometric Interpretation**: MONOCHROME2 * **Frame Count**: 1 +* **Instance Number**: 42 +* **SOP Class UID**: 1.2.840.10008.5.1.4.1.1.2 +* **SOP Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432.1 * **Pixel Data Present**: Yes ``` From 61a5fa06e243604ff48b86dbb2f7dd7510870268 Mon Sep 17 00:00:00 2001 From: timburman Date: Fri, 12 Jun 2026 14:29:44 +0530 Subject: [PATCH 8/8] docs: update readme --- packages/markitdown-dicom/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md index 0d078b6ac..ce09eac83 100644 --- a/packages/markitdown-dicom/README.md +++ b/packages/markitdown-dicom/README.md @@ -8,9 +8,9 @@ The plugin is designed to be highly memory-efficient (using deferred loading for - **Efficient Stream Peeking**: Fast detection of `.dcm` files by peeking at the `DICM` file preamble/magic bytes at offset 128. - **Memory Safety**: Uses `pydicom` with deferred value loading (`defer_size="1 KB"`) to parse headers of large multi-frame DICOM files without loading gigabytes of pixel data. -- **HIPAA-Compliant by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date. +- **PII-Aware by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date. - **Formatted Metadata**: Standardizes dates to `YYYY-MM-DD` and times to `HH:MM:SS` for downstream RAG and vector database ingestion. -- **Custom Tag Support**: Automatically extracts and formats additional standard and private vendor tags if they are simple numbers/text. +- **Custom Tag Support**: Automatically extracts additional standard metadata fields. Private/vendor tags can optionally be included and are filtered to avoid binary, sequence, and other high-volume data types. ## Installation