Fix #15: avoid duplicated cell text during extraction by discarding overlapping text boxes

pull/206/head
Eduardo Gonzalez Lopez de Murillas 2020-10-27 17:51:24 +01:00
parent 8ca30f3a3c
commit 7695d35449
4 changed files with 148 additions and 2 deletions

View File

@ -353,7 +353,7 @@ def text_in_bbox(bbox, text):
Returns
-------
t_bbox : list
List of PDFMiner text objects that lie inside table.
List of PDFMiner text objects that lie inside table, discarding the overlapping ones
"""
lb = (bbox[0], bbox[1])
@ -364,7 +364,97 @@ def text_in_bbox(bbox, text):
if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
]
return t_bbox
# Avoid duplicate text by discarding overlapping boxes
rest = {t for t in t_bbox}
for ba in t_bbox:
for bb in rest.copy():
if ba == bb:
continue
if bbox_intersect(ba, bb):
# if the intersection is larger than 80% of ba's size, we keep the longest
if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8:
if bbox_longer(bb, ba):
rest.discard(ba)
unique_boxes = list(rest)
return unique_boxes
def bbox_intersection_area(ba, bb) -> float:
    """Returns area of the intersection of the bounding boxes of two PDFMiner objects.

    Only the ``x0``, ``y0``, ``x1`` and ``y1`` attributes of each object
    are read.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the intersection of the bounding boxes of both objects.
        ``0.0`` when the boxes do not overlap on at least one axis.

    """
    # Edges of the candidate intersection rectangle.
    x_left = max(ba.x0, bb.x0)
    y_top = min(ba.y1, bb.y1)
    x_right = min(ba.x1, bb.x1)
    y_bottom = max(ba.y0, bb.y0)

    # An inverted rectangle on either axis means the boxes are disjoint.
    if x_right < x_left or y_bottom > y_top:
        return 0.0

    return (x_right - x_left) * (y_top - y_bottom)
def bbox_area(bb) -> float:
    """Returns area of the bounding box of a PDFMiner object.

    Only the ``x0``, ``y0``, ``x1`` and ``y1`` attributes of the object
    are read.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Area of the bounding box of the object

    """
    width = bb.x1 - bb.x0
    height = bb.y1 - bb.y0
    return width * height
def bbox_intersect(ba, bb) -> bool:
    """Returns True if the bounding boxes of two PDFMiner objects intersect.

    The comparisons are inclusive, so boxes that merely share an edge or
    a corner are reported as intersecting.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    """
    # The boxes intersect only if their extents overlap on both axes.
    overlap_x = ba.x1 >= bb.x0 and bb.x1 >= ba.x0
    overlap_y = ba.y1 >= bb.y0 and bb.y1 >= ba.y0
    return overlap_x and overlap_y
def bbox_longer(ba, bb) -> bool:
    """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.

    "Longer" refers to the horizontal extent only (``x1 - x0``); the
    vertical coordinates are not consulted.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    """
    width_a = ba.x1 - ba.x0
    width_b = bb.x1 - bb.x0
    return width_a >= width_b
def merge_close_lines(ar, line_tol=2):

View File

@ -2798,3 +2798,51 @@ data_stream_layout_kwargs = [
["A.O.P Cornas", ""],
["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
]
# Expected cell contents, row by row, for the duplicated-text stream test.
# The test builds a DataFrame from this list and compares it with the first
# table parsed from "birdisland.pdf" using flavor="stream".
# Every row has 11 cells; empty strings mark cells with no text.
data_stream_duplicated_text = [
['', '2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]', '', '', '', '', '', '', '', '',
'ALL SEASON TEST'],
['', 'Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]', '', '', '', '', '', '', '', '',
'1.3 - 2.0 MAT. GROUP'],
['PREV. CROP/HERB:', 'Corn / Surpass, Roundup', '', '', '', '', '', '', '', '', 'S2MNCE01'],
['SOIL DESCRIPTION:', '', 'Canisteo clay loam, mod. well drained, non-irrigated', '', '', '', '', '', '', '', ''],
['SOIL CONDITIONS:', '', 'High P, high K, 6.7 pH, 3.9% OM, Low SCN', '', '', '', '', '', '', '', '30" ROW SPACING'],
['TILLAGE/CULTIVATION:', 'conventional w/ fall till', '', '', '', '', '', '', '', '', ''],
['PEST MANAGEMENT:', 'Roundup twice', '', '', '', '', '', '', '', '', ''],
['SEEDED - RATE:', 'May 15', '140 000 /A', '', '', '', '', '', '', 'TOP 30 for YIELD of 63 TESTED', ''],
['HARVESTED - STAND:', 'Oct 3', '122 921 /A', '', '', '', '', '', '', 'AVERAGE of (3) REPLICATIONS', ''],
['', '', '', '', 'SCN', 'Seed', 'Yield', 'Moisture', 'Lodging', 'Stand', 'Gross'],
['Company/Brand', 'Product/Brand†', 'Technol.†', 'Mat.', 'Resist.', 'Trmt.†', 'Bu/A', '%', '%', '(x 1000)',
'Income'], ['Kruger', 'K2 1901', 'RR2Y', '1.9', 'R', 'Ac,PV', '56.4', '7.6', '0', '126.3', '$846'],
['Stine', '19RA02 §', 'RR2Y', '1.9', 'R', 'CMB', '55.3', '7.6', '0', '120.0', '$830'],
['Wensman', 'W 3190NR2', 'RR2Y', '1.9', 'R', 'Ac', '54.5', '7.6', '0', '119.5', '$818'],
['Hefty', 'H17Y12', 'RR2Y', '1.7', 'MR', 'I', '53.7', '7.7', '0', '124.4', '$806'],
['Dyna-Gro', 'S15RY53', 'RR2Y', '1.5', 'R', 'Ac', '53.6', '7.7', '0', '126.8', '$804'],
['LG Seeds', 'C2050R2', 'RR2Y', '2.1', 'R', 'Ac', '53.6', '7.7', '0', '123.9', '$804'],
['Titan Pro', '19M42', 'RR2Y', '1.9', 'R', 'CMB', '53.6', '7.7', '0', '121.0', '$804'],
['Stine', '19RA02 (2) §', 'RR2Y', '1.9', 'R', 'CMB', '53.4', '7.7', '0', '123.9', '$801'],
['Asgrow', 'AG1832 §', 'RR2Y', '1.8', 'MR', 'Ac,PV', '52.9', '7.7', '0', '122.0', '$794'],
['Prairie Brand', 'PB-1566R2', 'RR2Y', '1.5', 'R', 'CMB', '52.8', '7.7', '0', '122.9', '$792'],
['Channel', '1901R2', 'RR2Y', '1.9', 'R', 'Ac,PV', '52.8', '7.6', '0', '123.4', '$791'],
['Titan Pro', '20M1', 'RR2Y', '2.0', 'R', 'Am', '52.5', '7.5', '0', '124.4', '$788'],
['Kruger', 'K2-2002', 'RR2Y', '2.0', 'R', 'Ac,PV', '52.4', '7.9', '0', '125.4', '$786'],
['Channel', '1700R2', 'RR2Y', '1.7', 'R', 'Ac,PV', '52.3', '7.9', '0', '123.9', '$784'],
['Hefty', 'H16Y11', 'RR2Y', '1.6', 'MR', 'I', '51.4', '7.6', '0', '123.9', '$771'],
['Anderson', '162R2Y', 'RR2Y', '1.6', 'R', 'None', '51.3', '7.5', '0', '119.5', '$770'],
['Titan Pro', '15M22', 'RR2Y', '1.5', 'R', 'CMB', '51.3', '7.8', '0', '125.4', '$769'],
['Dairyland', 'DSR-1710R2Y', 'RR2Y', '1.7', 'R', 'CMB', '51.3', '7.7', '0', '122.0', '$769'],
['Hefty', 'H20R3', 'RR2Y', '2.0', 'MR', 'I', '50.5', '8.2', '0', '121.0', '$757'],
['Prairie Brand', 'PB 1743R2', 'RR2Y', '1.7', 'R', 'CMB', '50.2', '7.7', '0', '125.8', '$752'],
['Gold Country', '1741', 'RR2Y', '1.7', 'R', 'Ac', '50.1', '7.8', '0', '123.9', '$751'],
['Trelay', '20RR43', 'RR2Y', '2.0', 'R', 'Ac,Ex', '49.9', '7.6', '0', '127.8', '$749'],
['Hefty', 'H14R3', 'RR2Y', '1.4', 'MR', 'I', '49.7', '7.7', '0', '122.9', '$746'],
['Prairie Brand', 'PB-2099NRR2', 'RR2Y', '2.0', 'R', 'CMB', '49.6', '7.8', '0', '126.3', '$743'],
['Wensman', 'W 3174NR2', 'RR2Y', '1.7', 'R', 'Ac', '49.3', '7.6', '0', '122.5', '$740'],
['Kruger', 'K2 1602', 'RR2Y', '1.6', 'R', 'Ac,PV', '48.7', '7.6', '0', '125.4', '$731'],
['NK Brand', 'S18-C2 §', 'RR2Y', '1.8', 'R', 'CMB', '48.7', '7.7', '0', '126.8', '$731'],
['Kruger', 'K2 1902', 'RR2Y', '1.9', 'R', 'Ac,PV', '48.7', '7.5', '0', '124.4', '$730'],
['Prairie Brand', 'PB-1823R2', 'RR2Y', '1.8', 'R', 'None', '48.5', '7.6', '0', '121.0', '$727'],
['Gold Country', '1541', 'RR2Y', '1.5', 'R', 'Ac', '48.4', '7.6', '0', '110.4', '$726'],
['', '', '', '', '', 'Test Average =', '47.6', '7.7', '0', '122.9', '$713'],
['', '', '', '', '', 'LSD (0.10) =', '5.7', '0.3', 'ns', '37.8', '566.4']
]

Binary file not shown.

View File

@ -314,3 +314,11 @@ def test_version_generation_with_prerelease_revision():
generate_version(version, prerelease=prerelease, revision=revision)
== "0.7.3-alpha.2"
)
def test_stream_duplicated_text():
    # Regression check: the stream-flavor table parsed from birdisland.pdf
    # must equal the expected rows exactly, cell for cell.
    df = pd.DataFrame(data_stream_duplicated_text)

    filename = os.path.join(testdir, "birdisland.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")

    assert_frame_equal(df, tables[0].df)