[datasets] COCO-Text V2 integration (#1888)

sarjil77 · web-flow · commit 5402948be3e6 · 2025-03-10T07:40:49.000+01:00
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -68,7 +68,7 @@ Supported datasets
 * MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
 * IIITHWS from `"Generating Synthetic Data for Text Recognition" <https://github.com/kris314/hwnet>`_.
 * WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" <https://arxiv.org/pdf/2103.14470v1.pdf>`_.
-
+* COCO-Text from `"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images" <https://arxiv.org/pdf/1601.07140v2>`_.
 
 .. toctree::
    :maxdepth: 2
diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst
@@ -36,6 +36,8 @@ doctr.datasets
 
 .. autoclass:: WILDRECEIPT
 
+.. autoclass:: COCOTEXT
+
 Synthetic dataset generator
 ---------------------------
 
diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst
diff --git a/doctr/datasets/__init__.py b/doctr/datasets/__init__.py
@@ -1,6 +1,7 @@
 from doctr.file_utils import is_tf_available
 
 from .generator import *
+from .coco_text import *
 from .cord import *
 from .detection import *
 from .doc_artefacts import *
diff --git a/doctr/datasets/coco_text.py b/doctr/datasets/coco_text.py
@@ -0,0 +1,151 @@
+# Copyright (C) 2021-2025, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from tqdm import tqdm
+
+from .datasets import AbstractDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
+
+__all__ = ["COCOTEXT"]
+
+
+class COCOTEXT(AbstractDataset):
+    """
+    COCO-Text dataset from `"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images"
+    <https://arxiv.org/pdf/1601.07140v2>`_ |
+    `"homepage" <https://bgshih.github.io/cocotext/>`_.
+
+    >>> # NOTE: You need to download the dataset first.
+    >>> from doctr.datasets import COCOTEXT
+    >>> train_set = COCOTEXT(train=True, img_folder="/path/to/coco_text/train2014/",
+    >>>                     label_path="/path/to/coco_text/cocotext.v2.json")
+    >>> img, target = train_set[0]
+    >>> test_set = COCOTEXT(train=False, img_folder="/path/to/coco_text/train2014/",
+    >>>                     label_path="/path/to/coco_text/cocotext.v2.json")
+    >>> img, target = test_set[0]
+
+    Only annotations whose legibility is "legible" are loaded, and images left without any are skipped.
+    With `train=False`, every image whose `set` field differs from "train" is selected.
+
+    Args:
+        img_folder: folder with all the images of the dataset
+        label_path: path to the annotations file of the dataset
+        train: whether the subset should be the training one
+        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
+        **kwargs: keyword arguments from `AbstractDataset`.
+
+    Raises:
+        ValueError: if both `recognition_task` and `detection_task` are set to True
+        FileNotFoundError: if `label_path`, `img_folder` or one of the selected images cannot be found
+    """
+
+    def __init__(
+        self,
+        img_folder: str,
+        label_path: str,
+        train: bool = True,
+        use_polygons: bool = False,
+        recognition_task: bool = False,
+        detection_task: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(
+            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
+        )
+        # Task check
+        if recognition_task and detection_task:
+            raise ValueError(
+                " 'recognition' and 'detection task' cannot be set to True simultaneously. "
+                + " To get the whole dataset with boxes and labels leave both parameters to False "
+            )
+
+        # File existence check
+        if not os.path.exists(label_path) or not os.path.exists(img_folder):
+            raise FileNotFoundError(f"unable to find {label_path if not os.path.exists(label_path) else img_folder}")
+
+        tmp_root = img_folder
+        self.train = train
+        np_dtype = np.float32
+        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
+
+        # JSON text is UTF-8: do not depend on the platform default encoding
+        with open(label_path, "r", encoding="utf-8") as file:
+            data = json.load(file)
+
+        # Filter images based on the set
+        img_items = [img for img in data["imgs"].items() if (img[1]["set"] == "train") == train]
+
+        # Group the annotations by image in a single pass, rather than rescanning all of them for every image
+        anns_by_image: dict[int, list[dict[str, Any]]] = {}
+        for ann in data["anns"].values():
+            anns_by_image.setdefault(ann["image_id"], []).append(ann)
+
+        for img_id, img_info in tqdm(img_items, desc="Preparing and Loading COCOTEXT", total=len(img_items)):
+            img_path = os.path.join(img_folder, img_info["file_name"])
+
+            if not os.path.exists(img_path):
+                raise FileNotFoundError(f"Unable to locate {img_path}")
+
+            # Get annotations for the current image (only legible text)
+            annotations = [ann for ann in anns_by_image.get(int(img_id), []) if ann["legibility"] == "legible"]
+
+            if not annotations:  # Some images have no annotations with readable text
+                continue
+
+            _targets = []
+
+            for annotation in annotations:
+                x, y, w, h = annotation["bbox"]
+                if use_polygons:
+                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+                    box = np.array(
+                        [
+                            [x, y],
+                            [x + w, y],
+                            [x + w, y + h],
+                            [x, y + h],
+                        ],
+                        dtype=np_dtype,
+                    )
+                else:
+                    # (xmin, ymin, xmax, ymax) coordinates
+                    box = [x, y, x + w, y + h]
+                _targets.append((annotation["utf8_string"], box))
+            text_targets, box_targets = zip(*_targets)
+
+            if recognition_task:
+                # `img_path` already contains `img_folder`: joining it with the root once more would duplicate
+                # the folder prefix whenever `img_folder` is a relative path
+                crops = crop_bboxes_from_image(
+                    img_path=img_path, geoms=np.asarray(box_targets, dtype=int).clip(min=0)
+                )
+                # Keep only non-empty labels without spaces
+                for crop, label in zip(crops, text_targets):
+                    if label and " " not in label:
+                        self.data.append((crop, label))
+
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
+            else:
+                self.data.append((
+                    img_path,
+                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
+                ))
+
+        # NOTE(review): the paths stored in `self.data` already contain `img_folder`; presumably `AbstractDataset`
+        # joins them with `self.root` when reading a sample - TODO confirm this is safe for a relative `img_folder`
+        self.root = tmp_root
+
+    def extra_repr(self) -> str:
+        """Return the extra information (selected subset) displayed in the representation of the dataset."""
+        return f"train={self.train}"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -711,3 +711,78 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
         with open(fn_i, "wb") as f:
             f.write(file.getbuffer())
     return str(image_folder), str(annotation_file)
+
+
+@pytest.fixture(scope="session")
+def mock_cocotext_dataset(tmpdir_factory, mock_image_stream):
+    """Build a tiny COCO-Text layout on disk: three images in `train2014` plus a `cocotext.v2.json` file.
+
+    Returns:
+        the image folder path and the annotation file path, both as strings
+    """
+    root = tmpdir_factory.mktemp("datasets")
+    cocotext_root = root.mkdir("cocotext")
+    annotations_folder = cocotext_root
+    image_folder = cocotext_root.mkdir("train2014")
+
+    # File names embed the zero-padded image id; each image has exactly one annotation below
+    filenames = [
+        "COCO_train2014_000000367969.jpg",
+        "COCO_train2014_000000077346.jpg",
+        "COCO_train2014_000000437996.jpg",
+    ]
+    labels = {
+        "cats": {},
+        "anns": {
+            "1": {
+                "mask": [286.1, 215.5, 285.2, 221.5, 304.6, 222.0, 304.6, 216.9],
+                "class": "machine printed",
+                "bbox": [285.2, 215.5, 19.4, 6.5],
+                "image_id": 367969,
+                "id": 108418,
+                "language": "english",
+                "area": 105.6,
+                "utf8_string": "GATO",
+                "legibility": "legible",
+            },
+            "2": {
+                "mask": [310.4, 304.6, 319.4, 302.1, 323.2, 318.1, 307.2, 318.1],
+                "class": "machine printed",
+                "bbox": [307.2, 302.1, 16.0, 16.0],
+                "image_id": 77346,
+                "id": 196817,
+                "language": "english",
+                "area": 184.75,
+                "utf8_string": "6",
+                "legibility": "legible",
+            },
+            "3": {
+                "mask": [212.6, 245.8, 210.1, 248.6, 212.0, 262.8, 221.9, 260.9, 227.4, 244.6],
+                "class": "machine printed",
+                "bbox": [210.1, 244.6, 17.3, 18.2],
+                "image_id": 437996,
+                "id": 134765,
+                "language": "english",
+                "area": 221.31,
+                "utf8_string": "17",
+                "legibility": "legible",
+            },
+        },
+        "imgs": {
+            "367969": {"id": 367969, "set": "train", "width": 640, "file_name": filenames[0], "height": 427},
+            "77346": {"id": 77346, "set": "train", "width": 640, "file_name": filenames[1], "height": 427},
+            "437996": {"id": 437996, "set": "train", "width": 640, "file_name": filenames[2], "height": 427},
+        },
+        "imgToAnns": {},
+        "info": {},
+    }
+
+    annotation_file = annotations_folder.join("cocotext.v2.json")
+    with open(annotation_file, "w") as f:
+        json.dump(labels, f)
+    file = BytesIO(mock_image_stream)
+    for img_name in filenames:
+        fn = image_folder.join(img_name)
+        with open(fn, "wb") as f:
+            f.write(file.getbuffer())
+    return str(image_folder), str(annotation_file)
diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
@@ -760,6 +760,40 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, detec
         datasets.WILDRECEIPT(*mock_wildreceipt_dataset, train=True, recognition_task=True, detection_task=True)
 
 
+@pytest.mark.parametrize("rotate", [True, False])
+@pytest.mark.parametrize(
+    "input_size, num_samples, recognition, detection",
+    [
+        ([512, 512], 3, False, False),  # Actual set has 13880 training samples and 3261 test samples
+        ([32, 128], 3, True, False),  # recognition
+        ([512, 512], 3, False, True),  # detection
+    ],
+)
+def test_cocotext_dataset(input_size, num_samples, rotate, recognition, detection, mock_cocotext_dataset):
+    """Check the size, repr and sample format of COCOTEXT in its full, recognition and detection modes."""
+    img_folder, label_path = mock_cocotext_dataset
+    ds = datasets.COCOTEXT(
+        img_folder,
+        label_path,
+        train=True,
+        img_transforms=Resize(input_size),
+        use_polygons=rotate,
+        recognition_task=recognition,
+        detection_task=detection,
+    )
+    assert len(ds) == num_samples
+    assert repr(ds) == f"COCOTEXT(train={True})"
+    if not (recognition or detection):
+        _validate_dataset(ds, input_size, is_polygons=rotate)
+    elif recognition:
+        _validate_dataset_recognition_part(ds, input_size)
+    else:
+        _validate_dataset_detection_part(ds, input_size, is_polygons=rotate)
+
+    with pytest.raises(ValueError):
+        datasets.COCOTEXT(img_folder, label_path, train=True, recognition_task=True, detection_task=True)
+
+
 # NOTE: following datasets are only for recognition task
 
 
diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
@@ -733,6 +733,40 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, detec
         datasets.WILDRECEIPT(*mock_wildreceipt_dataset, train=True, recognition_task=True, detection_task=True)
 
 
+@pytest.mark.parametrize("rotate", [True, False])
+@pytest.mark.parametrize(
+    "input_size, num_samples, recognition, detection",
+    [
+        ([512, 512], 3, False, False),  # Actual set has 1268 training samples and 3261 test samples
+        ([32, 128], 3, True, False),  # recognition
+        ([512, 512], 3, False, True),  # detection
+    ],
+)
+def test_cocotext_dataset(input_size, num_samples, rotate, recognition, detection, mock_cocotext_dataset):
+    """Check the size, repr and sample format of COCOTEXT in its full, recognition and detection modes."""
+    img_folder, label_path = mock_cocotext_dataset
+    ds = datasets.COCOTEXT(
+        img_folder,
+        label_path,
+        train=True,
+        img_transforms=Resize(input_size),
+        use_polygons=rotate,
+        recognition_task=recognition,
+        detection_task=detection,
+    )
+    assert len(ds) == num_samples
+    assert repr(ds) == f"COCOTEXT(train={True})"
+    if not (recognition or detection):
+        _validate_dataset(ds, input_size, is_polygons=rotate)
+    elif recognition:
+        _validate_dataset_recognition_part(ds, input_size)
+    else:
+        _validate_dataset_detection_part(ds, input_size, is_polygons=rotate)
+
+    with pytest.raises(ValueError):
+        datasets.COCOTEXT(img_folder, label_path, train=True, recognition_task=True, detection_task=True)
+
+
 # NOTE: following datasets are only for recognition task