1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .datasets import AbstractDataset
15
+ from .utils import convert_target_to_relative , crop_bboxes_from_image
16
+
17
# Explicit public API of this module: only the dataset class is exported.
__all__ = ["COCOTEXT"]
18
+
19
+
20
class COCOTEXT(AbstractDataset):
    """
    COCO-Text dataset from `"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images"
    <https://arxiv.org/pdf/1601.07140v2>`_ |
    `"homepage" <https://bgshih.github.io/cocotext/>`_.

    >>> # NOTE: You need to download the dataset first.
    >>> from doctr.datasets import COCOTEXT
    >>> train_set = COCOTEXT(train=True, img_folder="/path/to/coco_text/train2014/",
    >>>                      label_path="/path/to/coco_text/cocotext.v2.json")
    >>> img, target = train_set[0]
    >>> test_set = COCOTEXT(train=False, img_folder="/path/to/coco_text/train2014/",
    >>>                     label_path="/path/to/coco_text/cocotext.v2.json")
    >>> img, target = test_set[0]

    Args:
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations file of the dataset
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        detection_task: whether the dataset should be used for detection task
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        detection_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
        )
        # The two single-task modes are mutually exclusive: leaving both False
        # yields the full (boxes + labels) targets.
        if recognition_task and detection_task:
            raise ValueError(
                "`recognition` and `detection` tasks cannot be set to True simultaneously. "
                "To get the whole dataset with boxes and labels leave both parameters to False"
            )

        # Fail fast with the missing path in the message
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to find {label_path if not os.path.exists(label_path) else img_folder}")

        tmp_root = img_folder
        self.train = train
        np_dtype = np.float32
        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []

        with open(label_path, "r") as file:
            data = json.load(file)

        # Index legible annotations by image id in a single pass. The previous
        # implementation rescanned every annotation for every image, which is
        # O(images * annotations) on a large JSON; this lookup table makes the
        # per-image step O(1) without changing which annotations are selected.
        legible_anns: dict[Any, list[dict[str, Any]]] = {}
        for ann in data["anns"].values():
            if ann["legibility"] == "legible":
                legible_anns.setdefault(ann["image_id"], []).append(ann)

        # Keep only the images belonging to the requested split
        img_items = [img for img in data["imgs"].items() if (img[1]["set"] == "train") == train]

        for img_id, img_info in tqdm(img_items, desc="Preparing and Loading COCOTEXT", total=len(img_items)):
            img_path = os.path.join(img_folder, img_info["file_name"])

            if not os.path.exists(img_path):
                raise FileNotFoundError(f"Unable to locate {img_path}")

            # "imgs" keys are strings while "image_id" is an int, hence int(img_id)
            annotations = legible_anns.get(int(img_id), [])

            if not annotations:  # Some images have no annotations with readable text
                continue

            _targets = []

            for annotation in annotations:
                x, y, w, h = annotation["bbox"]
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box = np.array(
                        [
                            [x, y],
                            [x + w, y],
                            [x + w, y + h],
                            [x, y + h],
                        ],
                        dtype=np_dtype,
                    )
                else:
                    # (xmin, ymin, xmax, ymax) coordinates
                    box = [x, y, x + w, y + h]
                _targets.append((annotation["utf8_string"], box))
            text_targets, box_targets = zip(*_targets)

            if recognition_task:
                # NOTE(review): img_path is already rooted at img_folder (== tmp_root),
                # so this join is a no-op only when img_folder is absolute — verify.
                crops = crop_bboxes_from_image(
                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
                )
                for crop, label in zip(crops, list(text_targets)):
                    # Skip empty labels and multi-word strings for recognition
                    if label and " " not in label:
                        self.data.append((crop, label))

            elif detection_task:
                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
            else:
                self.data.append((
                    img_path,
                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
                ))

        self.root = tmp_root

    def extra_repr(self) -> str:
        # Surfaces the selected split in repr() output
        return f"train={self.train}"
0 commit comments