Merge 6a6ad7d2c3 into 644bbe7c6d
commit
9c37aea893
|
|
@ -34,6 +34,7 @@ from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||||
|
|
||||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||||
_VALID_URLS.discard("")
|
_VALID_URLS.discard("")
|
||||||
|
SMOOTHING_FACTOR = 0.000001
|
||||||
|
|
||||||
|
|
||||||
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||||
|
|
@ -373,7 +374,7 @@ def text_in_bbox(bbox, text):
|
||||||
continue
|
continue
|
||||||
if bbox_intersect(ba, bb):
|
if bbox_intersect(ba, bb):
|
||||||
# if the intersection is larger than 80% of ba's size, we keep the longest
|
# if the intersection is larger than 80% of ba's size, we keep the longest
|
||||||
if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8:
|
if (bbox_intersection_area(ba, bb) / (bbox_area(ba) + SMOOTHING_FACTOR)) > 0.8:
|
||||||
if bbox_longer(bb, ba):
|
if bbox_longer(bb, ba):
|
||||||
rest.discard(ba)
|
rest.discard(ba)
|
||||||
unique_boxes = list(rest)
|
unique_boxes = list(rest)
|
||||||
|
|
@ -623,7 +624,11 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
else:
|
else:
|
||||||
# TODO: add test
|
# TODO: add test
|
||||||
if cut == x_cuts[-1] :
|
if cut == x_cuts[-1] :
|
||||||
cut_text.append((r, cut[0] + 1, obj))
|
col = cut[0]
|
||||||
|
# Only advance to the next column if it exists, to avoid an out-of-range index into table.cols
|
||||||
|
if col + 1 < len(table.cols) :
|
||||||
|
col = cut[0] + 1
|
||||||
|
cut_text.append((r, col, obj))
|
||||||
elif isinstance(obj, LTAnno):
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((r, cut[0], obj))
|
cut_text.append((r, cut[0], obj))
|
||||||
elif direction == "vertical" and not textline.is_empty():
|
elif direction == "vertical" and not textline.is_empty():
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue