Fix #15 extraction of cell data discarding overlapping text boxes
parent
8ca30f3a3c
commit
7695d35449
|
|
@ -353,7 +353,7 @@ def text_in_bbox(bbox, text):
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
t_bbox : list
|
t_bbox : list
|
||||||
List of PDFMiner text objects that lie inside table.
|
List of PDFMiner text objects that lie inside table, discarding the overlapping ones
|
||||||
|
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
|
|
@ -364,7 +364,97 @@ def text_in_bbox(bbox, text):
|
||||||
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
|
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
|
||||||
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
|
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
|
||||||
]
|
]
|
||||||
return t_bbox
|
|
||||||
|
# Avoid duplicate text by discarding overlapping boxes
|
||||||
|
rest = {t for t in t_bbox}
|
||||||
|
for ba in t_bbox:
|
||||||
|
for bb in rest.copy():
|
||||||
|
if ba == bb:
|
||||||
|
continue
|
||||||
|
if bbox_intersect(ba, bb):
|
||||||
|
# if the intersection is larger than 80% of ba's size, we keep the longest
|
||||||
|
if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8:
|
||||||
|
if bbox_longer(bb, ba):
|
||||||
|
rest.discard(ba)
|
||||||
|
unique_boxes = list(rest)
|
||||||
|
|
||||||
|
return unique_boxes
|
||||||
|
|
||||||
|
|
||||||
|
def bbox_intersection_area(ba, bb) -> float:
    """Compute the area shared by the bounding boxes of two PDFMiner objects.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the rectangle common to both bounding boxes, or 0.0 when
        the boxes do not overlap.

    """
    # Edges of the candidate overlap rectangle.
    left, right = max(ba.x0, bb.x0), min(ba.x1, bb.x1)
    bottom, top = max(ba.y0, bb.y0), min(ba.y1, bb.y1)

    # Inverted edges on either axis mean the boxes are disjoint.
    if right < left or top < bottom:
        return 0.0

    return (right - left) * (top - bottom)
|
||||||
|
|
||||||
|
|
||||||
|
def bbox_area(bb) -> float:
    """Compute the area covered by the bounding box of a PDFMiner object.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Width times height of the object's bounding box.

    """
    width = bb.x1 - bb.x0
    height = bb.y1 - bb.y0
    return width * height
|
||||||
|
|
||||||
|
|
||||||
|
def bbox_intersect(ba, bb) -> bool:
    """Tell whether the bounding boxes of two PDFMiner objects intersect.

    Boxes that merely touch along an edge or at a corner count as
    intersecting, because the comparisons are inclusive.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    """
    # Horizontal extents must overlap before the vertical ones are checked.
    x_overlap = ba.x1 >= bb.x0 and bb.x1 >= ba.x0
    return x_overlap and (ba.y1 >= bb.y0 and bb.y1 >= ba.y0)
|
||||||
|
|
||||||
|
|
||||||
|
def bbox_longer(ba, bb) -> bool:
    """Tell whether the first PDFMiner object's bounding box is at least as wide as the second's.

    "Longer" refers to the horizontal extent (x1 - x0) only.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    """
    first_width = ba.x1 - ba.x0
    second_width = bb.x1 - bb.x0
    return first_width >= second_width
|
||||||
|
|
||||||
|
|
||||||
def merge_close_lines(ar, line_tol=2):
|
def merge_close_lines(ar, line_tol=2):
|
||||||
|
|
|
||||||
|
|
@ -2798,3 +2798,51 @@ data_stream_layout_kwargs = [
|
||||||
["A.O.P Cornas", ""],
|
["A.O.P Cornas", ""],
|
||||||
["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
|
["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Expected table for the stream-flavor parse of birdisland.pdf; compared
# cell-for-cell by test_stream_duplicated_text.
data_stream_duplicated_text = [
    ["", "2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]", "", "", "", "", "", "", "", "", "ALL SEASON TEST"],
    ["", "Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]", "", "", "", "", "", "", "", "", "1.3 - 2.0 MAT. GROUP"],
    ["PREV. CROP/HERB:", "Corn / Surpass, Roundup", "", "", "", "", "", "", "", "", "S2MNCE01"],
    ["SOIL DESCRIPTION:", "", "Canisteo clay loam, mod. well drained, non-irrigated", "", "", "", "", "", "", "", ""],
    ["SOIL CONDITIONS:", "", "High P, high K, 6.7 pH, 3.9% OM, Low SCN", "", "", "", "", "", "", "", '30" ROW SPACING'],
    ["TILLAGE/CULTIVATION:", "conventional w/ fall till", "", "", "", "", "", "", "", "", ""],
    ["PEST MANAGEMENT:", "Roundup twice", "", "", "", "", "", "", "", "", ""],
    ["SEEDED - RATE:", "May 15", "140 000 /A", "", "", "", "", "", "", "TOP 30 for YIELD of 63 TESTED", ""],
    ["HARVESTED - STAND:", "Oct 3", "122 921 /A", "", "", "", "", "", "", "AVERAGE of (3) REPLICATIONS", ""],
    ["", "", "", "", "SCN", "Seed", "Yield", "Moisture", "Lodging", "Stand", "Gross"],
    ["Company/Brand", "Product/Brand†", "Technol.†", "Mat.", "Resist.", "Trmt.†", "Bu/A", "%", "%", "(x 1000)", "Income"],
    ["Kruger", "K2 1901", "RR2Y", "1.9", "R", "Ac,PV", "56.4", "7.6", "0", "126.3", "$846"],
    ["Stine", "19RA02 §", "RR2Y", "1.9", "R", "CMB", "55.3", "7.6", "0", "120.0", "$830"],
    ["Wensman", "W 3190NR2", "RR2Y", "1.9", "R", "Ac", "54.5", "7.6", "0", "119.5", "$818"],
    ["Hefty", "H17Y12", "RR2Y", "1.7", "MR", "I", "53.7", "7.7", "0", "124.4", "$806"],
    ["Dyna-Gro", "S15RY53", "RR2Y", "1.5", "R", "Ac", "53.6", "7.7", "0", "126.8", "$804"],
    ["LG Seeds", "C2050R2", "RR2Y", "2.1", "R", "Ac", "53.6", "7.7", "0", "123.9", "$804"],
    ["Titan Pro", "19M42", "RR2Y", "1.9", "R", "CMB", "53.6", "7.7", "0", "121.0", "$804"],
    ["Stine", "19RA02 (2) §", "RR2Y", "1.9", "R", "CMB", "53.4", "7.7", "0", "123.9", "$801"],
    ["Asgrow", "AG1832 §", "RR2Y", "1.8", "MR", "Ac,PV", "52.9", "7.7", "0", "122.0", "$794"],
    ["Prairie Brand", "PB-1566R2", "RR2Y", "1.5", "R", "CMB", "52.8", "7.7", "0", "122.9", "$792"],
    ["Channel", "1901R2", "RR2Y", "1.9", "R", "Ac,PV", "52.8", "7.6", "0", "123.4", "$791"],
    ["Titan Pro", "20M1", "RR2Y", "2.0", "R", "Am", "52.5", "7.5", "0", "124.4", "$788"],
    ["Kruger", "K2-2002", "RR2Y", "2.0", "R", "Ac,PV", "52.4", "7.9", "0", "125.4", "$786"],
    ["Channel", "1700R2", "RR2Y", "1.7", "R", "Ac,PV", "52.3", "7.9", "0", "123.9", "$784"],
    ["Hefty", "H16Y11", "RR2Y", "1.6", "MR", "I", "51.4", "7.6", "0", "123.9", "$771"],
    ["Anderson", "162R2Y", "RR2Y", "1.6", "R", "None", "51.3", "7.5", "0", "119.5", "$770"],
    ["Titan Pro", "15M22", "RR2Y", "1.5", "R", "CMB", "51.3", "7.8", "0", "125.4", "$769"],
    ["Dairyland", "DSR-1710R2Y", "RR2Y", "1.7", "R", "CMB", "51.3", "7.7", "0", "122.0", "$769"],
    ["Hefty", "H20R3", "RR2Y", "2.0", "MR", "I", "50.5", "8.2", "0", "121.0", "$757"],
    ["Prairie Brand", "PB 1743R2", "RR2Y", "1.7", "R", "CMB", "50.2", "7.7", "0", "125.8", "$752"],
    ["Gold Country", "1741", "RR2Y", "1.7", "R", "Ac", "50.1", "7.8", "0", "123.9", "$751"],
    ["Trelay", "20RR43", "RR2Y", "2.0", "R", "Ac,Ex", "49.9", "7.6", "0", "127.8", "$749"],
    ["Hefty", "H14R3", "RR2Y", "1.4", "MR", "I", "49.7", "7.7", "0", "122.9", "$746"],
    ["Prairie Brand", "PB-2099NRR2", "RR2Y", "2.0", "R", "CMB", "49.6", "7.8", "0", "126.3", "$743"],
    ["Wensman", "W 3174NR2", "RR2Y", "1.7", "R", "Ac", "49.3", "7.6", "0", "122.5", "$740"],
    ["Kruger", "K2 1602", "RR2Y", "1.6", "R", "Ac,PV", "48.7", "7.6", "0", "125.4", "$731"],
    ["NK Brand", "S18-C2 §", "RR2Y", "1.8", "R", "CMB", "48.7", "7.7", "0", "126.8", "$731"],
    ["Kruger", "K2 1902", "RR2Y", "1.9", "R", "Ac,PV", "48.7", "7.5", "0", "124.4", "$730"],
    ["Prairie Brand", "PB-1823R2", "RR2Y", "1.8", "R", "None", "48.5", "7.6", "0", "121.0", "$727"],
    ["Gold Country", "1541", "RR2Y", "1.5", "R", "Ac", "48.4", "7.6", "0", "110.4", "$726"],
    ["", "", "", "", "", "Test Average =", "47.6", "7.7", "0", "122.9", "$713"],
    ["", "", "", "", "", "LSD (0.10) =", "5.7", "0.3", "ns", "37.8", "566.4"],
]
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -314,3 +314,11 @@ def test_version_generation_with_prerelease_revision():
|
||||||
generate_version(version, prerelease=prerelease, revision=revision)
|
generate_version(version, prerelease=prerelease, revision=revision)
|
||||||
== "0.7.3-alpha.2"
|
== "0.7.3-alpha.2"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_duplicated_text():
    """Check the first stream-flavor table of birdisland.pdf against the expected fixture."""
    expected = pd.DataFrame(data_stream_duplicated_text)

    pdf_path = os.path.join(testdir, "birdisland.pdf")
    parsed_tables = camelot.read_pdf(pdf_path, flavor="stream")
    assert_frame_equal(expected, parsed_tables[0].df)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue