pull/218/merge
guet3401 2021-10-03 05:35:48 -07:00 committed by GitHub
commit 9c37aea893
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 8 additions and 3 deletions

View File

@ -34,6 +34,7 @@ from urllib.parse import uses_relative, uses_netloc, uses_params
_VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("") _VALID_URLS.discard("")
SMOOTHING_FACTOR = 0.000001
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
@ -373,7 +374,7 @@ def text_in_bbox(bbox, text):
continue continue
if bbox_intersect(ba, bb): if bbox_intersect(ba, bb):
# if the intersection is larger than 80% of ba's size, we keep the longest # if the intersection is larger than 80% of ba's size, we keep the longest
if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8: if (bbox_intersection_area(ba, bb) / (bbox_area(ba) + SMOOTHING_FACTOR)) > 0.8:
if bbox_longer(bb, ba): if bbox_longer(bb, ba):
rest.discard(ba) rest.discard(ba)
unique_boxes = list(rest) unique_boxes = list(rest)
@ -622,8 +623,12 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
break break
else: else:
# TODO: add test # TODO: add test
if cut == x_cuts[-1]: if cut == x_cuts[-1] :
cut_text.append((r, cut[0] + 1, obj)) col = cut[0]
# avoid an out-of-range column index: only advance to col + 1 when it is still a valid index into table.cols
if col + 1 < len(table.cols) :
col = cut[0] + 1
cut_text.append((r, col, obj))
elif isinstance(obj, LTAnno): elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj)) cut_text.append((r, cut[0], obj))
elif direction == "vertical" and not textline.is_empty(): elif direction == "vertical" and not textline.is_empty():