From 69fe564dcd5633f1d5a9f158a5fd86790ee809be Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 13:57:46 +0530
Subject: [PATCH 1/8] build: add markitdown-dicom package boilerplate and entry
 points

---
 packages/markitdown-dicom/.gitignore          |  1 +
 packages/markitdown-dicom/pyproject.toml      | 68 +++++++++++++++++++
 .../src/markitdown_dicom/__about__.py         |  4 ++
 .../src/markitdown_dicom/__init__.py          | 14 ++++
 .../src/markitdown_dicom/_plugin.py           | 11 +++
 .../src/markitdown_dicom/py.typed             |  1 +
 6 files changed, 99 insertions(+)
 create mode 100644 packages/markitdown-dicom/.gitignore
 create mode 100644 packages/markitdown-dicom/pyproject.toml
 create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/__about__.py
 create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/__init__.py
 create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/_plugin.py
 create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/py.typed

diff --git a/packages/markitdown-dicom/.gitignore b/packages/markitdown-dicom/.gitignore
new file mode 100644
index 000000000..571830800
--- /dev/null
+++ b/packages/markitdown-dicom/.gitignore
@@ -0,0 +1 @@
+tests/test-dicom-files/
diff --git a/packages/markitdown-dicom/pyproject.toml b/packages/markitdown-dicom/pyproject.toml
new file mode 100644
index 000000000..8c2687539
--- /dev/null
+++ b/packages/markitdown-dicom/pyproject.toml
@@ -0,0 +1,68 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "markitdown-dicom"
+dynamic = ["version"]
+description = 'DICOM converter plugin for MarkItDown - Extracts metadata from .dcm files'
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+keywords = ["markitdown", "dicom", "metadata", "pydicom"]
+authors = [
+  { name = "Aryan Kaushik", email = "aryankaushik251@gmail.com" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: Implementation :: CPython",
+]
+dependencies = [
+  "markitdown>=0.1.0a1",
+  "pydicom>=2.4.0",
+]
+
+[project.urls]
+Documentation = "https://github.com/microsoft/markitdown#readme"
+Issues = "https://github.com/microsoft/markitdown/issues"
+Source = "https://github.com/microsoft/markitdown"
+
+[tool.hatch.version]
+path = "src/markitdown_dicom/__about__.py"
+
+[project.entry-points."markitdown.plugin"]
+dicom = "markitdown_dicom"
+
+[tool.hatch.envs.types]
+extra-dependencies = [
+  "mypy>=1.0.0",
+]
+[tool.hatch.envs.types.scripts]
+check = "mypy --install-types --non-interactive {args:src/markitdown_dicom tests}"
+
+[tool.coverage.run]
+source_pkgs = ["markitdown_dicom", "tests"]
+branch = true
+parallel = true
+omit = [
+  "src/markitdown_dicom/__about__.py",
+]
+
+[tool.coverage.paths]
+markitdown-dicom = ["src/markitdown_dicom", "*/markitdown-dicom/src/markitdown_dicom"]
+tests = ["tests", "*/markitdown-dicom/tests"]
+
+[tool.coverage.report]
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
+]
+
+[tool.hatch.build.targets.sdist]
+only-include = ["src/markitdown_dicom"]
diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__about__.py b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py
new file mode 100644
index 000000000..24f8ff955
--- /dev/null
+++ b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2026-present Aryan Kaushik <aryankaushik251@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+__version__ = "0.1.0a1"
diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__init__.py b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py
new file mode 100644
index 000000000..1da335c88
--- /dev/null
+++ b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: 2026-present Aryan Kaushik <aryankaushik251@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+
+from ._plugin import __plugin_interface_version__, register_converters
+from ._dicom_converter import DicomConverter
+from .__about__ import __version__
+
+__all__ = [
+    "__version__",
+    "__plugin_interface_version__",
+    "register_converters",
+    "DicomConverter",
+]
diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py b/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py
new file mode 100644
index 000000000..4106b49db
--- /dev/null
+++ b/packages/markitdown-dicom/src/markitdown_dicom/_plugin.py
@@ -0,0 +1,11 @@
+from typing import Any
+from markitdown import MarkItDown
+from ._dicom_converter import DicomConverter
+
+__plugin_interface_version__ = 1
+
+def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
+    """
+    Called during construction of MarkItDown instances to register converters provided by plugins.
+    """
+    markitdown.register_converter(DicomConverter(**kwargs))
diff --git a/packages/markitdown-dicom/src/markitdown_dicom/py.typed b/packages/markitdown-dicom/src/markitdown_dicom/py.typed
new file mode 100644
index 000000000..7632ecf77
--- /dev/null
+++ b/packages/markitdown-dicom/src/markitdown_dicom/py.typed
@@ -0,0 +1 @@
+# Marker file for PEP 561

From 61be04341c7266389925c03a3469df7a100a1690 Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 13:58:03 +0530
Subject: [PATCH 2/8] feat: implement DicomConverter with accepts and metadata
 formatting

---
 .../src/markitdown_dicom/_dicom_converter.py  | 300 ++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py

diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
new file mode 100644
index 000000000..35cc2c22b
--- /dev/null
+++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
@@ -0,0 +1,300 @@
+# SPDX-FileCopyrightText: 2026-present Aryan Kaushik <aryankaushik251@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+
+import re
+import sys
+from typing import Any, BinaryIO, Dict, List, Optional
+
+from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo, MissingDependencyException
+
+# Import pydicom eagerly but capture any ImportError so that convert() can raise MissingDependencyException later.
+_dependency_exc_info = None
+try:
+    import pydicom
+except ImportError:
+    _dependency_exc_info = sys.exc_info()
+
+
+class DicomConverter(DocumentConverter):
+    """
+    Converts DICOM (.dcm) files to structured, token-efficient Markdown.
+    Extracts key Study, Series, Acquisition, Equipment, and Image characteristics.
+    Omits and redacts Patient PII (Name, ID, Birth Date) by default.
+    """
+
+    def __init__(self, redact_pii: bool = True, **kwargs: Any):
+        super().__init__()
+        self._redact_pii = redact_pii
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        # Check standard extension / MIME type
+        if extension in (".dcm", ".dicom") or mimetype == "application/dicom":
+            return True
+
+        # Peek at stream to check signature 'DICM' at offset 128
+        cur_pos = file_stream.tell()
+        try:
+            file_stream.seek(128)
+            sig = file_stream.read(4)
+            if sig == b"DICM":
+                return True
+        except Exception:
+            pass
+        finally:
+            file_stream.seek(cur_pos)
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Check if pydicom is available
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                "markitdown-dicom requires pydicom to be installed. "
+                "To resolve, run: pip install pydicom"
+            ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])  # type: ignore
+
+        # Resolve redact_pii setting (defaulting to True)
+        redact_pii = kwargs.get("redact_pii", self._redact_pii)
+
+        # Parse DICOM from the stream.
+        # Use defer_size="1 KB" so we don't load large pixel data arrays into memory.
+        # force=True allows parsing datasets without file meta header.
+        try:
+            ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True)
+            if ds is None or len(ds) == 0:
+                raise ValueError("Parsed dataset has no elements.")
+        except Exception as e:
+            raise ValueError(f"Failed to parse DICOM file: {e}") from e
+
+        # Extracted elements
+        lines = ["# DICOM File", ""]
+
+        # Date and Time Formatter helpers
+        def _format_date(val: Any) -> Optional[str]:
+            if not val:
+                return None
+            val_str = str(val).strip()
+            if len(val_str) == 8 and val_str.isdigit():
+                return f"{val_str[0:4]}-{val_str[4:6]}-{val_str[6:8]}"
+            return val_str
+
+        def _format_time(val: Any) -> Optional[str]:
+            if not val:
+                return None
+            val_str = str(val).strip()
+            if "." in val_str:
+                time_part, frac_part = val_str.split(".", 1)
+            else:
+                time_part, frac_part = val_str, ""
+
+            if len(time_part) >= 6 and time_part.isdigit():
+                formatted = f"{time_part[0:2]}:{time_part[2:4]}:{time_part[4:6]}"
+                if frac_part:
+                    formatted += f".{frac_part}"
+                return formatted
+            elif len(time_part) >= 4 and time_part.isdigit():
+                formatted = f"{time_part[0:2]}:{time_part[2:4]}"
+                if frac_part:
+                    formatted += f".{frac_part}"
+                return formatted
+            return val_str
+
+        def _get_val(keyword: str) -> Any:
+            val = getattr(ds, keyword, None)
+            if val is None:
+                return None
+            if isinstance(val, (list, tuple)) or type(val).__name__ == "MultiValue":
+                return ", ".join(str(x) for x in val)
+            return val
+
+        # Define category structures
+        # 1. Patient Information
+        p_name = _get_val("PatientName")
+        p_id = _get_val("PatientID")
+        p_dob = _get_val("PatientBirthDate")
+
+        if redact_pii:
+            p_name = "[REDACTED]" if p_name is not None else None
+            p_id = "[REDACTED]" if p_id is not None else None
+            p_dob = "[REDACTED]" if p_dob is not None else None
+        else:
+            if p_name:
+                p_name = str(p_name).replace("^", " ").strip()
+
+        patient_fields = {
+            "Patient Name": p_name,
+            "Patient ID": p_id,
+            "Patient Birth Date": _format_date(p_dob),
+            "Patient Sex": _get_val("PatientSex"),
+            "Patient Age": _get_val("PatientAge"),
+        }
+
+        # 2. Study Information
+        study_fields = {
+            "Study Instance UID": _get_val("StudyInstanceUID"),
+            "Study Date": _format_date(_get_val("StudyDate")),
+            "Study Time": _format_time(_get_val("StudyTime")),
+            "Study Description": _get_val("StudyDescription"),
+            "Accession Number": _get_val("AccessionNumber"),
+        }
+
+        # 3. Series Information
+        series_fields = {
+            "Series Instance UID": _get_val("SeriesInstanceUID"),
+            "Series Number": _get_val("SeriesNumber"),
+            "Series Description": _get_val("SeriesDescription"),
+        }
+
+        # 4. Acquisition Information
+        acquisition_fields = {
+            "Modality": _get_val("Modality"),
+            "Protocol Name": _get_val("ProtocolName"),
+            "Exposure": _get_val("Exposure"),
+            "Exposure Time": _get_val("ExposureTime"),
+            "KVP": _get_val("KVP"),
+            "Acquisition Date": _format_date(_get_val("AcquisitionDate")),
+            "Acquisition Time": _format_time(_get_val("AcquisitionTime")),
+        }
+
+        # 5. Equipment Information
+        equipment_fields = {
+            "Manufacturer": _get_val("Manufacturer"),
+            "Manufacturer Model Name": _get_val("ManufacturerModelName"),
+            "Device Serial Number": _get_val("DeviceSerialNumber"),
+            "Software Versions": _get_val("SoftwareVersions"),
+        }
+
+        # 6. Image Characteristics
+        rows = _get_val("Rows")
+        cols = _get_val("Columns")
+        resolution = f"{rows} × {cols}" if rows and cols else None
+
+        pixel_data_present = "Yes" if (0x7FE0, 0x0010) in ds else "No"
+
+        image_fields = {
+            "Resolution": resolution,
+            "Samples Per Pixel": _get_val("SamplesPerPixel"),
+            "Bits Allocated": _get_val("BitsAllocated"),
+            "Bits Stored": _get_val("BitsStored"),
+            "High Bit": _get_val("HighBit"),
+            "Pixel Representation": _get_val("PixelRepresentation"),
+            "Photometric Interpretation": _get_val("PhotometricInterpretation"),
+            "Frame Count": _get_val("NumberOfFrames"),
+            "Pixel Data Present": pixel_data_present,
+        }
+
+        # 7. Other Useful Text Fields
+        other_fields = {
+            "Image Comments": _get_val("ImageComments"),
+            "Institution Name": _get_val("InstitutionName"),
+            "Station Name": _get_val("StationName"),
+            "Body Part Examined": _get_val("BodyPartExamined"),
+        }
+
+        # Helper to render sections
+        def _render_section(title: str, fields: Dict[str, Any]) -> List[str]:
+            active = {k: v for k, v in fields.items() if v is not None and str(v).strip() != ""}
+            if not active:
+                return []
+            sec_lines = [f"## {title}", ""]
+            for k, v in active.items():
+                sec_lines.append(f"* **{k}**: {v}")
+            sec_lines.append("")
+            return sec_lines
+
+        # Predefined sections
+        lines.extend(_render_section("Patient Information", patient_fields))
+        lines.extend(_render_section("Study Information", study_fields))
+        lines.extend(_render_section("Series Information", series_fields))
+        lines.extend(_render_section("Acquisition Parameters", acquisition_fields))
+        lines.extend(_render_section("Equipment", equipment_fields))
+        lines.extend(_render_section("Image Properties", image_fields))
+        lines.extend(_render_section("Other Information", other_fields))
+
+        # 8. Private / Custom textual tags when reasonable
+        EXCLUDED_KEYWORDS = {
+            # Study
+            "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber",
+            # Series
+            "SeriesInstanceUID", "SeriesNumber", "SeriesDescription",
+            # Acquisition
+            "Modality", "ProtocolName", "Exposure", "ExposureTime", "KVP", "AcquisitionDate", "AcquisitionTime",
+            # Equipment
+            "Manufacturer", "ManufacturerModelName", "DeviceSerialNumber", "SoftwareVersions",
+            # Image Characteristics
+            "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames",
+            # Other Useful Text Fields
+            "ImageComments", "InstitutionName", "StationName", "BodyPartExamined",
+            # Patient info
+            "PatientName", "PatientID", "PatientBirthDate", "PatientSex", "PatientAge"
+        }
+        EXCLUDED_VRS = {"OB", "OW", "OF", "OD", "SQ", "UN"}
+
+        custom_fields: Dict[str, str] = {}
+        for elem in ds:
+            # Skip file meta or pixel group
+            if elem.tag.group in (0x0002, 0x7FE0) or elem.tag.element == 0:
+                continue
+
+            # Skip binary, sequence, or unknown VRs
+            if elem.VR in EXCLUDED_VRS:
+                continue
+
+            keyword = elem.keyword
+            if not keyword:
+                if elem.tag.is_private:
+                    label = f"Private Tag ({elem.tag.group:04X},{elem.tag.element:04X})"
+                else:
+                    label = f"Tag ({elem.tag.group:04X},{elem.tag.element:04X})"
+            else:
+                if keyword in EXCLUDED_KEYWORDS:
+                    continue
+                # Split CamelCase to separate words
+                label = re.sub(r'(?<!^)(?=[A-Z])', ' ', keyword)
+
+            val = elem.value
+            if val is None or val == "":
+                continue
+
+            # Check for PII tags if redaction is enabled
+            lower_label = label.lower()
+            if redact_pii and ("patient" in lower_label or "name" in lower_label or "birth" in lower_label or "id" in lower_label):
+                if "sex" not in lower_label and "age" not in lower_label:
+                    continue
+
+            # Format list value or other type
+            if isinstance(val, (list, tuple)) or type(val).__name__ == "MultiValue":
+                val_str = ", ".join(str(x) for x in val)
+            else:
+                val_str = str(val)
+
+            if elem.VR == "PN":
+                val_str = val_str.replace("^", " ").strip()
+
+            custom_fields[label] = val_str
+
+        # Render custom & private tags if any
+        if custom_fields:
+            sorted_custom = dict(sorted(custom_fields.items()))
+            lines.extend(_render_section("Additional Fields", sorted_custom))
+
+        # Strip extra trailing whitespaces/newlines and return result
+        markdown_content = "\n".join(lines).strip() + "\n"
+        return DocumentConverterResult(
+            title=study_fields.get("Study Description") or "DICOM Document",
+            markdown=markdown_content,
+        )

From 9e432d8ed86fa456ab254b6884f3286e0c5edc0a Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 13:58:10 +0530
Subject: [PATCH 3/8] test: add deterministic in-memory unit tests for
 DicomConverter

---
 packages/markitdown-dicom/tests/__init__.py   |   3 +
 .../tests/test_dicom_converter.py             | 240 ++++++++++++++++++
 2 files changed, 243 insertions(+)
 create mode 100644 packages/markitdown-dicom/tests/__init__.py
 create mode 100644 packages/markitdown-dicom/tests/test_dicom_converter.py

diff --git a/packages/markitdown-dicom/tests/__init__.py b/packages/markitdown-dicom/tests/__init__.py
new file mode 100644
index 000000000..aa8931747
--- /dev/null
+++ b/packages/markitdown-dicom/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2026-present Aryan Kaushik <aryankaushik251@gmail.com>
+#
+# SPDX-License-Identifier: MIT
diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py
new file mode 100644
index 000000000..51f1fba33
--- /dev/null
+++ b/packages/markitdown-dicom/tests/test_dicom_converter.py
@@ -0,0 +1,240 @@
+# SPDX-FileCopyrightText: 2026-present Aryan Kaushik <aryankaushik251@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+
+import io
+import pytest
+from typing import Dict, Any, Optional
+
+import pydicom
+from pydicom.dataset import FileDataset, FileMetaDataset
+from pydicom.uid import ExplicitVRLittleEndian, generate_uid
+
+from markitdown import MarkItDown, StreamInfo
+from markitdown_dicom import DicomConverter
+
+
+def create_mock_dicom(
+    patient_name: Optional[str] = "Test^Patient",
+    patient_id: Optional[str] = "123456",
+    patient_dob: Optional[str] = "19800101",
+    modality: str = "CT",
+    study_description: str = "Mock Study",
+    rows: Optional[int] = 512,
+    cols: Optional[int] = 512,
+    has_pixel_data: bool = True,
+    extra_fields: Optional[Dict[str, Any]] = None,
+) -> io.BytesIO:
+    """Helper to programmatically generate a valid DICOM file in memory."""
+    file_meta = FileMetaDataset()
+    file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2"  # CT Image Storage
+    file_meta.MediaStorageSOPInstanceUID = generate_uid()
+    file_meta.TransferSyntaxUID = ExplicitVRLittleEndian
+    file_meta.ImplementationClassUID = pydicom.uid.PYDICOM_IMPLEMENTATION_UID
+
+    ds = FileDataset("in_memory.dcm", {}, file_meta=file_meta, preamble=b"\0" * 128)
+
+    if patient_name is not None:
+        ds.PatientName = patient_name
+    if patient_id is not None:
+        ds.PatientID = patient_id
+    if patient_dob is not None:
+        ds.PatientBirthDate = patient_dob
+
+    ds.PatientSex = "M"
+    ds.PatientAge = "045Y"
+    ds.Modality = modality
+    ds.StudyInstanceUID = generate_uid()
+    ds.SeriesInstanceUID = generate_uid()
+    ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID
+    ds.SOPClassUID = file_meta.MediaStorageSOPClassUID
+    ds.StudyDate = "20260612"
+    ds.StudyTime = "120000.123"
+    ds.StudyDescription = study_description
+    ds.AccessionNumber = "ACC-12345"
+    ds.SeriesNumber = 1
+    ds.SeriesDescription = "PA View"
+    ds.Manufacturer = "GE Medical Systems"
+
+    if rows is not None:
+        ds.Rows = rows
+    if cols is not None:
+        ds.Columns = cols
+
+    ds.SamplesPerPixel = 1
+    ds.BitsAllocated = 16
+    ds.BitsStored = 16
+    ds.HighBit = 15
+    ds.PixelRepresentation = 0
+    ds.PhotometricInterpretation = "MONOCHROME2"
+
+    if has_pixel_data:
+        # Use simple dummy bytes for pixel data
+        ds.PixelData = b"\x00" * 100
+
+    if extra_fields:
+        for keyword, val in extra_fields.items():
+            setattr(ds, keyword, val)
+
+    buffer = io.BytesIO()
+    ds.save_as(buffer, enforce_file_format=False)
+    buffer.seek(0)
+    return buffer
+
+
+def test_dicom_converter_accepts() -> None:
+    """Verifies that the DicomConverter accepts DICOM streams using metadata or signature checks."""
+    converter = DicomConverter()
+
+    # Case 1: Acceptance by extension
+    assert converter.accepts(
+        io.BytesIO(b""),
+        StreamInfo(extension=".dcm"),
+    )
+    assert converter.accepts(
+        io.BytesIO(b""),
+        StreamInfo(extension=".dicom"),
+    )
+
+    # Case 2: Acceptance by MIME type
+    assert converter.accepts(
+        io.BytesIO(b""),
+        StreamInfo(mimetype="application/dicom"),
+    )
+
+    # Case 3: Acceptance by peeking at DICM signature at offset 128
+    mock_dicom = create_mock_dicom()
+    assert converter.accepts(
+        mock_dicom,
+        StreamInfo(extension=".raw"),  # Wrong extension, but valid stream
+    )
+
+    # Case 4: Rejection of non-DICOM content
+    assert not converter.accepts(
+        io.BytesIO(b"\x00" * 200),
+        StreamInfo(extension=".txt"),
+    )
+
+
+def test_dicom_converter_default_redaction() -> None:
+    """Tests that by default, patient identifying details are redacted but clinical demographics are kept."""
+    converter = DicomConverter()
+    stream = create_mock_dicom(
+        patient_name="Doe^John",
+        patient_id="PID-999",
+        patient_dob="19750505",
+    )
+
+    result = converter.convert(stream, StreamInfo())
+
+    # PatientName, PatientID, and PatientBirthDate must be redacted
+    assert "Doe John" not in result.markdown
+    assert "PID-999" not in result.markdown
+    assert "1975-05-05" not in result.markdown
+    assert "Patient Name**: [REDACTED]" in result.markdown
+    assert "Patient ID**: [REDACTED]" in result.markdown
+    assert "Patient Birth Date**: [REDACTED]" in result.markdown
+
+    # Patient Sex and Age should remain as clinical metadata
+    assert "Patient Sex**: M" in result.markdown
+    assert "Patient Age**: 045Y" in result.markdown
+
+    # Verifying other standard sections are rendered properly
+    assert "Study Description**: Mock Study" in result.markdown
+    assert "Resolution**: 512 × 512" in result.markdown
+    assert "Study Date**: 2026-06-12" in result.markdown
+    assert "Study Time**: 12:00:00.123" in result.markdown
+
+
+def test_dicom_converter_disabled_redaction() -> None:
+    """Tests that when redact_pii is set to False, identifiers are extracted normally."""
+    converter = DicomConverter(redact_pii=False)
+    stream = create_mock_dicom(
+        patient_name="Doe^John",
+        patient_id="PID-999",
+        patient_dob="19750505",
+    )
+
+    result = converter.convert(stream, StreamInfo())
+
+    assert "Patient Name**: Doe John" in result.markdown
+    assert "Patient ID**: PID-999" in result.markdown
+    assert "Patient Birth Date**: 1975-05-05" in result.markdown
+
+
+def test_dicom_converter_missing_fields() -> None:
+    """Verifies that missing optional tags do not raise exceptions and are simply omitted."""
+    converter = DicomConverter()
+    # Create DICOM file with no manufacturer, resolution, or description
+    stream = create_mock_dicom(
+        study_description="",
+        rows=None,
+        cols=None,
+    )
+
+    result = converter.convert(stream, StreamInfo())
+
+    # Ensure no empty field or error occurs
+    assert "Resolution" not in result.markdown
+    assert "Study Description" not in result.markdown
+    assert "Manufacturer**" in result.markdown  # Manufacturer remains since it wasn't set to None
+    assert "DICOM File" in result.markdown
+
+
+def test_dicom_converter_custom_and_private_tags() -> None:
+    """Verifies that extra textual/numeric tags and private tags are formatted correctly."""
+    converter = DicomConverter()
+
+    # Add custom standard tags (e.g. BodyPartExamined, InstitutionName) and a private tag
+    # Private tags use odd group numbers, e.g., 0x0009
+    extra_fields = {
+        "InstitutionName": "Central Hospital",
+        "BodyPartExamined": "CHEST",
+        "InstitutionAddress": "123 Clinic Rd",
+    }
+    stream = create_mock_dicom(extra_fields=extra_fields)
+
+    # Let's add a raw private tag directly to the dataset
+    ds = pydicom.dcmread(stream, force=True)
+    # Register private creator block
+    ds.private_block(0x0009, "Mock Creator", create=True)
+    # Add a private element in group 0x0009
+    ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value")
+
+    # Save modified dataset to a new stream
+    new_stream = io.BytesIO()
+    ds.save_as(new_stream)
+    new_stream.seek(0)
+
+    result = converter.convert(new_stream, StreamInfo())
+
+    # Verify standard custom fields
+    assert "Institution Name**: Central Hospital" in result.markdown
+    assert "Body Part Examined**: CHEST" in result.markdown
+
+    # Verify additional standard fields split camelcase
+    assert "Institution Address**: 123 Clinic Rd" in result.markdown
+
+    # Verify private tag rendering
+    assert "Private Tag (0009,1001)**: Mock Private Value" in result.markdown
+
+
+def test_markitdown_plugin_integration() -> None:
+    """Tests that MarkItDown loads and uses the DicomConverter when enable_plugins is True."""
+    md = MarkItDown(enable_plugins=True)
+    stream = create_mock_dicom(study_description="Integration Test")
+
+    # Convert using the file stream with hint
+    result = md.convert(stream, stream_info=StreamInfo(extension=".dcm"))
+
+    assert "Study Description**: Integration Test" in result.markdown
+    assert "Patient Name**: [REDACTED]" in result.markdown
+
+
+def test_corrupted_dicom() -> None:
+    """Verifies that a corrupted DICOM stream raises ValueError during conversion."""
+    converter = DicomConverter()
+    corrupt_stream = io.BytesIO(b"DICM" + b"\xff" * 100)
+
+    with pytest.raises(ValueError, match="Failed to parse DICOM file"):
+        converter.convert(corrupt_stream, StreamInfo())

From 9a0dd91bffb43feb0610c3a1804ba21341ec4050 Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 13:58:17 +0530
Subject: [PATCH 4/8] docs: add installation and usage guide in
 markitdown-dicom README

---
 packages/markitdown-dicom/README.md | 112 ++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 packages/markitdown-dicom/README.md

diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md
new file mode 100644
index 000000000..a30bc8d6b
--- /dev/null
+++ b/packages/markitdown-dicom/README.md
@@ -0,0 +1,112 @@
+# MarkItDown DICOM Plugin (`markitdown-dicom`)
+
+This is a plugin for [MarkItDown](https://github.com/microsoft/markitdown) that adds support for converting DICOM (`.dcm`) files into LLM-friendly Markdown metadata representations.
+
+The plugin is designed to be highly memory-efficient (using deferred loading for pixel data) and token-efficient, ignoring raw pixel arrays while extracting clinically-relevant metadata.
+
+## Features
+
+- **Efficient Stream Peeking**: Fast detection of DICOM streams, even without a `.dcm`/`.dicom` extension, by checking for the `DICM` magic bytes that follow the 128-byte preamble (i.e. at offset 128).
+- **Memory Safety**: Uses `pydicom` with deferred value loading (`defer_size="1 KB"`) to parse headers of large multi-frame DICOM files without loading gigabytes of pixel data.
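+- **PII Redaction by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date. Other potentially identifying fields (e.g. Accession Number, UIDs, study dates, Institution Name, Device Serial Number) are still emitted, so this alone does not make the output HIPAA de-identified.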
+- **Formatted Metadata**: Standardizes dates to `YYYY-MM-DD` and times to `HH:MM:SS` (fractional seconds, when present, are preserved) for downstream RAG and vector database ingestion.
+- **Custom Tag Support**: Automatically extracts and formats additional standard and private vendor tags if they are simple numbers/text.
+
+## Installation
+
+Install the plugin along with MarkItDown:
+
+```bash
+pip install markitdown-dicom
+```
+
+## Usage
+
+### Command Line Interface
+
+Use the `-p` (or `--use-plugins`) option to enable third-party plugins:
+
+```bash
+markitdown --use-plugins patient_scan.dcm -o patient_scan.md
+```
+
+### Python API
+
+```python
+from markitdown import MarkItDown
+
+# Initialize MarkItDown with plugins enabled
+md = MarkItDown(enable_plugins=True)
+
+# Convert a DICOM file
+result = md.convert("patient_scan.dcm")
+print(result.text_content)
+```
+
+### Disabling PII Redaction
+
+If you are working in a secure clinical environment, or with data that is already de-identified, and want to retain Patient Name, Patient ID, and Patient Birth Date, you can disable redaction:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(enable_plugins=True, redact_pii=False)
+result = md.convert("patient_scan.dcm")
+```
+
+## Example Output
+
+```markdown
+# DICOM File
+
+## Patient Information
+
+* **Patient Name**: [REDACTED]
+* **Patient ID**: [REDACTED]
+* **Patient Birth Date**: [REDACTED]
+* **Patient Sex**: M
+* **Patient Age**: 045Y
+
+## Study Information
+
+* **Study Instance UID**: 1.2.840.113619.2.134.1.20230612.98765432
+* **Study Date**: 2023-06-12
+* **Study Time**: 11:44:27
+* **Study Description**: Chest X-Ray
+* **Accession Number**: ACC-98765
+
+## Series Information
+
+* **Series Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432
+* **Series Number**: 1
+* **Series Description**: PA View
+
+## Acquisition Parameters
+
+* **Modality**: DX
+* **Protocol Name**: Chest PA
+* **Exposure**: 2
+* **Exposure Time**: 10
+* **KVP**: 120
+* **Acquisition Date**: 2023-06-12
+* **Acquisition Time**: 11:45:00
+
+## Equipment
+
+* **Manufacturer**: GE Medical Systems
+* **Manufacturer Model Name**: Discovery
+* **Device Serial Number**: SN-12345
+* **Software Versions**: v1.2.3
+
+## Image Properties
+
+* **Resolution**: 2048 × 1500
+* **Samples Per Pixel**: 1
+* **Bits Allocated**: 16
+* **Bits Stored**: 12
+* **High Bit**: 11
+* **Pixel Representation**: 0
+* **Photometric Interpretation**: MONOCHROME2
+* **Frame Count**: 1
+* **Pixel Data Present**: Yes
+```

From d195275889af2453a85d0e816cd2e60815848d61 Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 14:07:15 +0530
Subject: [PATCH 5/8] feat: ignore private vendor tags by default to prevent
 metadata bloat

---
 .../src/markitdown_dicom/_dicom_converter.py  | 10 +++++--
 .../tests/test_dicom_converter.py             | 29 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
index 35cc2c22b..fe97bb39a 100644
--- a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
+++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
@@ -23,9 +23,10 @@ class DicomConverter(DocumentConverter):
     Omits and redacts Patient PII (Name, ID, Birth Date) by default.
     """
 
-    def __init__(self, redact_pii: bool = True, **kwargs: Any):
+    def __init__(self, redact_pii: bool = True, include_private_tags: bool = False, **kwargs: Any):
         super().__init__()
         self._redact_pii = redact_pii
+        self._include_private_tags = include_private_tags
 
     def accepts(
         self,
@@ -67,8 +68,9 @@ def convert(
                 "To resolve, run: pip install pydicom"
             ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])  # type: ignore
 
-        # Resolve redact_pii setting (defaulting to True)
+        # Resolve settings
         redact_pii = kwargs.get("redact_pii", self._redact_pii)
+        include_private_tags = kwargs.get("include_private_tags", self._include_private_tags)
 
         # Parse DICOM from the stream.
         # Use defer_size="1 KB" so we don't load large pixel data arrays into memory.
@@ -246,6 +248,10 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]:
 
         custom_fields: Dict[str, str] = {}
         for elem in ds:
+            # Skip private tags unless explicitly requested
+            if elem.tag.is_private and not include_private_tags:
+                continue
+
             # Skip file meta or pixel group
             if elem.tag.group in (0x0002, 0x7FE0) or elem.tag.element == 0:
                 continue
diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py
index 51f1fba33..138d832ac 100644
--- a/packages/markitdown-dicom/tests/test_dicom_converter.py
+++ b/packages/markitdown-dicom/tests/test_dicom_converter.py
@@ -183,7 +183,7 @@ def test_dicom_converter_missing_fields() -> None:
 
 def test_dicom_converter_custom_and_private_tags() -> None:
     """Verifies that extra textual/numeric tags and private tags are formatted correctly."""
-    converter = DicomConverter()
+    converter = DicomConverter(include_private_tags=True)
 
     # Add custom standard tags (e.g. BodyPartExamined, InstitutionName) and a private tag
     # Private tags use odd group numbers, e.g., 0x0009
@@ -219,6 +219,33 @@ def test_dicom_converter_custom_and_private_tags() -> None:
     assert "Private Tag (0009,1001)**: Mock Private Value" in result.markdown
 
 
+def test_dicom_converter_exclude_private_tags_by_default() -> None:
+    """Verifies that private tags are left out of the Markdown when include_private_tags keeps its default (False)."""
+    converter = DicomConverter()  # include_private_tags is not passed, so it defaults to False
+
+    extra_fields = {
+        "InstitutionName": "Central Hospital",
+    }
+    stream = create_mock_dicom(extra_fields=extra_fields)
+    # Re-read the mock dataset so a private element (odd group 0x0009) can be added to it.
+    ds = pydicom.dcmread(stream, force=True)
+    ds.private_block(0x0009, "Mock Creator", create=True)
+    ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value")
+    # Write the modified dataset to a fresh in-memory stream and rewind it for the converter.
+    new_stream = io.BytesIO()
+    ds.save_as(new_stream)
+    new_stream.seek(0)
+
+    result = converter.convert(new_stream, StreamInfo())
+
+    # The standard (non-private) extra field must still be rendered
+    assert "Institution Name**: Central Hospital" in result.markdown
+
+    # Neither the private element's value nor any "Private Tag" label may appear in the output
+    assert "Mock Private Value" not in result.markdown
+    assert "Private Tag" not in result.markdown
+
+
 def test_markitdown_plugin_integration() -> None:
     """Tests that MarkItDown loads and uses the DicomConverter when enable_plugins is True."""
     md = MarkItDown(enable_plugins=True)

From a3ab076e1bfb9fda7b689e917c2fae7e76d660d9 Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 14:18:11 +0530
Subject: [PATCH 6/8] refactor: implement strict-first parsing, robust PII
 checks, and expand metadata fields

---
 .../src/markitdown_dicom/_dicom_converter.py  | 50 +++++++++++++------
 .../tests/test_dicom_converter.py             | 16 +++++-
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
index fe97bb39a..5effe62f0 100644
--- a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
+++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py
@@ -18,9 +18,11 @@
 
 class DicomConverter(DocumentConverter):
     """
-    Converts DICOM (.dcm) files to structured, token-efficient Markdown.
+    Converts DICOM (.dcm, .dicom) files to structured, token-efficient Markdown.
     Extracts key Study, Series, Acquisition, Equipment, and Image characteristics.
     Omits and redacts Patient PII (Name, ID, Birth Date) by default.
+    Supports both medical imaging and industrial radiography datasets conforming to the
+    DICONDE standard (ASTM E2339) used in Non-Destructive Testing (NDT).
     """
 
     def __init__(self, redact_pii: bool = True, include_private_tags: bool = False, **kwargs: Any):
@@ -41,7 +43,9 @@ def accepts(
         if extension in (".dcm", ".dicom") or mimetype == "application/dicom":
             return True
 
-        # Peek at stream to check signature 'DICM' at offset 128
+        # Peek at stream to check signature 'DICM' at offset 128.
+        # This acts as a robust fallback for files lacking standard extensions (like
+        # industrial NDT or DICONDE images).
         cur_pos = file_stream.tell()
         try:
             file_stream.seek(128)
@@ -74,11 +78,22 @@ def convert(
 
         # Parse DICOM from the stream.
         # Use defer_size="1 KB" so we don't load large pixel data arrays into memory.
-        # force=True allows parsing datasets without file meta header.
+        # We attempt a strict read first (force=False) to ensure compliance and avoid false positives
+        # on non-DICOM streams. If that fails (e.g. for raw datasets lacking a file meta header),
+        # we reset and fall back to force=True.
+        cur_pos = file_stream.tell()
         try:
-            ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True)
+            ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=False)
             if ds is None or len(ds) == 0:
                 raise ValueError("Parsed dataset has no elements.")
+        except (pydicom.errors.InvalidDicomError, TypeError):
+            file_stream.seek(cur_pos)
+            try:
+                ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True)
+                if ds is None or len(ds) == 0:
+                    raise ValueError("Parsed dataset has no elements.")
+            except Exception as e:
+                raise ValueError(f"Failed to parse DICOM file: {e}") from e
         except Exception as e:
             raise ValueError(f"Failed to parse DICOM file: {e}") from e
 
@@ -148,6 +163,7 @@ def _get_val(keyword: str) -> Any:
         # 2. Study Information
         study_fields = {
             "Study Instance UID": _get_val("StudyInstanceUID"),
+            "Study ID": _get_val("StudyID"),
             "Study Date": _format_date(_get_val("StudyDate")),
             "Study Time": _format_time(_get_val("StudyTime")),
             "Study Description": _get_val("StudyDescription"),
@@ -159,6 +175,8 @@ def _get_val(keyword: str) -> Any:
             "Series Instance UID": _get_val("SeriesInstanceUID"),
             "Series Number": _get_val("SeriesNumber"),
             "Series Description": _get_val("SeriesDescription"),
+            "Series Date": _format_date(_get_val("SeriesDate")),
+            "Series Time": _format_time(_get_val("SeriesTime")),
         }
 
         # 4. Acquisition Information
@@ -181,14 +199,11 @@ def _get_val(keyword: str) -> Any:
         }
 
         # 6. Image Characteristics
-        rows = _get_val("Rows")
-        cols = _get_val("Columns")
-        resolution = f"{rows} × {cols}" if rows and cols else None
-
         pixel_data_present = "Yes" if (0x7FE0, 0x0010) in ds else "No"
 
         image_fields = {
-            "Resolution": resolution,
+            "Rows": _get_val("Rows"),
+            "Columns": _get_val("Columns"),
             "Samples Per Pixel": _get_val("SamplesPerPixel"),
             "Bits Allocated": _get_val("BitsAllocated"),
             "Bits Stored": _get_val("BitsStored"),
@@ -196,6 +211,9 @@ def _get_val(keyword: str) -> Any:
             "Pixel Representation": _get_val("PixelRepresentation"),
             "Photometric Interpretation": _get_val("PhotometricInterpretation"),
             "Frame Count": _get_val("NumberOfFrames"),
+            "Instance Number": _get_val("InstanceNumber"),
+            "SOP Class UID": _get_val("SOPClassUID"),
+            "SOP Instance UID": _get_val("SOPInstanceUID"),
             "Pixel Data Present": pixel_data_present,
         }
 
@@ -230,15 +248,15 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]:
         # 8. Private / Custom textual tags when reasonable
         EXCLUDED_KEYWORDS = {
             # Study
-            "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber",
+            "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber", "StudyID",
             # Series
-            "SeriesInstanceUID", "SeriesNumber", "SeriesDescription",
+            "SeriesInstanceUID", "SeriesNumber", "SeriesDescription", "SeriesDate", "SeriesTime",
             # Acquisition
             "Modality", "ProtocolName", "Exposure", "ExposureTime", "KVP", "AcquisitionDate", "AcquisitionTime",
             # Equipment
             "Manufacturer", "ManufacturerModelName", "DeviceSerialNumber", "SoftwareVersions",
             # Image Characteristics
-            "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames",
+            "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames", "InstanceNumber", "SOPClassUID", "SOPInstanceUID",
             # Other Useful Text Fields
             "ImageComments", "InstitutionName", "StationName", "BodyPartExamined",
             # Patient info
@@ -276,10 +294,10 @@ def _render_section(title: str, fields: Dict[str, Any]) -> List[str]:
             if val is None or val == "":
                 continue
 
-            # Check for PII tags if redaction is enabled
-            lower_label = label.lower()
-            if redact_pii and ("patient" in lower_label or "name" in lower_label or "birth" in lower_label or "id" in lower_label):
-                if "sex" not in lower_label and "age" not in lower_label:
+            # Check for Patient PII tags if redaction is enabled (standard Patient group is 0x0010)
+            # Retain clinical demographics: PatientSex (0x0010, 0x0040) and PatientAge (0x0010, 0x1010)
+            if redact_pii:
+                if elem.tag.group == 0x0010 and elem.tag.element not in (0x0040, 0x1010):
                     continue
 
             # Format list value or other type
diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py
index 138d832ac..931fa7a1b 100644
--- a/packages/markitdown-dicom/tests/test_dicom_converter.py
+++ b/packages/markitdown-dicom/tests/test_dicom_converter.py
@@ -48,6 +48,10 @@ def create_mock_dicom(
     ds.SeriesInstanceUID = generate_uid()
     ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID
     ds.SOPClassUID = file_meta.MediaStorageSOPClassUID
+    ds.StudyID = "STUDY-1"
+    ds.SeriesDate = "20260612"
+    ds.SeriesTime = "120500"
+    ds.InstanceNumber = 42
     ds.StudyDate = "20260612"
     ds.StudyTime = "120000.123"
     ds.StudyDescription = study_description
@@ -141,9 +145,16 @@ def test_dicom_converter_default_redaction() -> None:
 
     # Verifying other standard sections are rendered properly
     assert "Study Description**: Mock Study" in result.markdown
-    assert "Resolution**: 512 × 512" in result.markdown
+    assert "Study ID**: STUDY-1" in result.markdown
+    assert "Series Date**: 2026-06-12" in result.markdown
+    assert "Series Time**: 12:05:00" in result.markdown
+    assert "Rows**: 512" in result.markdown
+    assert "Columns**: 512" in result.markdown
+    assert "Instance Number**: 42" in result.markdown
     assert "Study Date**: 2026-06-12" in result.markdown
     assert "Study Time**: 12:00:00.123" in result.markdown
+    assert "SOP Class UID**" in result.markdown
+    assert "SOP Instance UID**" in result.markdown
 
 
 def test_dicom_converter_disabled_redaction() -> None:
@@ -175,7 +186,8 @@ def test_dicom_converter_missing_fields() -> None:
     result = converter.convert(stream, StreamInfo())
 
     # Ensure no empty field or error occurs
-    assert "Resolution" not in result.markdown
+    assert "Rows" not in result.markdown
+    assert "Columns" not in result.markdown
     assert "Study Description" not in result.markdown
     assert "Manufacturer**" in result.markdown  # Manufacturer remains since it wasn't set to None
     assert "DICOM File" in result.markdown

From 54a3993b7f5601f64ca6d46c89a3538ca328650a Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 14:23:26 +0530
Subject: [PATCH 7/8] docs: update README example to match the new tags and
 resolution layout

---
 packages/markitdown-dicom/README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md
index a30bc8d6b..0d078b6ac 100644
--- a/packages/markitdown-dicom/README.md
+++ b/packages/markitdown-dicom/README.md
@@ -70,6 +70,7 @@ result = md.convert("patient_scan.dcm")
 ## Study Information
 
 * **Study Instance UID**: 1.2.840.113619.2.134.1.20230612.98765432
+* **Study ID**: STUDY-1
 * **Study Date**: 2023-06-12
 * **Study Time**: 11:44:27
 * **Study Description**: Chest X-Ray
@@ -80,6 +81,8 @@ result = md.convert("patient_scan.dcm")
 * **Series Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432
 * **Series Number**: 1
 * **Series Description**: PA View
+* **Series Date**: 2023-06-12
+* **Series Time**: 11:45:00
 
 ## Acquisition Parameters
 
@@ -100,7 +103,8 @@ result = md.convert("patient_scan.dcm")
 
 ## Image Properties
 
-* **Resolution**: 2048 × 1500
+* **Rows**: 2048
+* **Columns**: 1500
 * **Samples Per Pixel**: 1
 * **Bits Allocated**: 16
 * **Bits Stored**: 12
@@ -108,5 +112,8 @@ result = md.convert("patient_scan.dcm")
 * **Pixel Representation**: 0
 * **Photometric Interpretation**: MONOCHROME2
 * **Frame Count**: 1
+* **Instance Number**: 42
+* **SOP Class UID**: 1.2.840.10008.5.1.4.1.1.1.1
+* **SOP Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432.1
 * **Pixel Data Present**: Yes
 ```

From 61a5fa06e243604ff48b86dbb2f7dd7510870268 Mon Sep 17 00:00:00 2001
From: timburman <aryankaushik251@gmail.com>
Date: Fri, 12 Jun 2026 14:29:44 +0530
Subject: [PATCH 8/8] docs: update readme

---
 packages/markitdown-dicom/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md
index 0d078b6ac..ce09eac83 100644
--- a/packages/markitdown-dicom/README.md
+++ b/packages/markitdown-dicom/README.md
@@ -8,9 +8,9 @@ The plugin is designed to be highly memory-efficient (using deferred loading for
 
 - **Efficient Stream Peeking**: Fast detection of `.dcm` files by peeking at the `DICM` file preamble/magic bytes at offset 128.
 - **Memory Safety**: Uses `pydicom` with deferred value loading (`defer_size="1 KB"`) to parse headers of large multi-frame DICOM files without loading gigabytes of pixel data.
-- **HIPAA-Compliant by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date.
+- **PII-Aware by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date.
 - **Formatted Metadata**: Standardizes dates to `YYYY-MM-DD` and times to `HH:MM:SS` for downstream RAG and vector database ingestion.
-- **Custom Tag Support**: Automatically extracts and formats additional standard and private vendor tags if they are simple numbers/text.
+- **Custom Tag Support**: Automatically extracts additional standard metadata fields. Private/vendor tags are excluded by default and can be included with `include_private_tags=True`; when included, they are filtered to avoid binary, sequence, and other high-volume data types.
 
 ## Installation