Skip to content

Commit 2de5a79

Browse files
authored
Merge pull request #2251 from myhloli/dev
feat(pdf_parse): add footnote block handling in layout split
2 parents cfa9074 + 058d318 commit 2de5a79

File tree

2 files changed

+9
-5
lines changed

2 files changed

+9
-5
lines changed

Diff for: magic_pdf/pdf_parse_union_core_v2.py

+8 -4
Original file line number | Diff line number | Diff line change
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
490490
return [[x0, y0, x1, y1]]
491491

492492

493-
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
493+
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
494494
page_line_list = []
495495

496496
def add_lines_to_block(b):
@@ -519,6 +519,10 @@ def add_lines_to_block(b):
519519
block['real_lines'] = copy.deepcopy(block['lines'])
520520
add_lines_to_block(block)
521521

522+
for block in footnote_blocks:
523+
footnote_block = {'bbox': block[:4]}
524+
add_lines_to_block(footnote_block)
525+
522526
if len(page_line_list) > 200: # layoutreader最高支持512line
523527
return None
524528

@@ -779,7 +783,7 @@ def merge_two_blocks(b1, b2):
779783
# interline_equation_blocks参数不够准,后面切换到interline_equations上
780784
interline_equation_blocks = []
781785
if len(interline_equation_blocks) > 0:
782-
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
786+
all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
783787
img_body_blocks, img_caption_blocks, img_footnote_blocks,
784788
table_body_blocks, table_caption_blocks, table_footnote_blocks,
785789
discarded_blocks,
@@ -790,7 +794,7 @@ def merge_two_blocks(b1, b2):
790794
page_h,
791795
)
792796
else:
793-
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
797+
all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
794798
img_body_blocks, img_caption_blocks, img_footnote_blocks,
795799
table_body_blocks, table_caption_blocks, table_footnote_blocks,
796800
discarded_blocks,
@@ -866,7 +870,7 @@ def merge_two_blocks(b1, b2):
866870
line_height = get_line_height(fix_blocks)
867871

868872
"""获取所有line并对line排序"""
869-
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
873+
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)
870874

871875
"""根据line的中位数算block的序列关系"""
872876
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

Diff for: magic_pdf/pre_proc/ocr_detect_all_bboxes.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
119119
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
120120
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
121121
all_bboxes.sort(key=lambda x: x[0]+x[1])
122-
return all_bboxes, all_discarded_blocks
122+
return all_bboxes, all_discarded_blocks, footnote_blocks
123123

124124

125125
def find_blocks_under_footnote(all_bboxes, footnote_blocks):

0 commit comments

Comments
 (0)