diff --git a/camelot/utils.py b/camelot/utils.py
index 3e8ab96..2126fbb 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -353,7 +353,7 @@ def text_in_bbox(bbox, text):
     Returns
     -------
     t_bbox : list
-        List of PDFMiner text objects that lie inside table.
+        List of PDFMiner text objects that lie inside table, discarding the overlapping ones
 
     """
     lb = (bbox[0], bbox[1])
@@ -364,7 +364,103 @@ def text_in_bbox(bbox, text):
         if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
         and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
     ]
-    return t_bbox
+
+    # Avoid duplicate text by discarding overlapping boxes
+    rest = set(t_bbox)
+    for ba in t_bbox:
+        ba_area = bbox_area(ba)
+        for bb in rest.copy():
+            if ba == bb:
+                continue
+            if bbox_intersect(ba, bb):
+                # If more than 80% of ba is covered by bb, keep only the longest of
+                # the two. A zero-area ba that intersects bb counts as fully covered,
+                # which also avoids dividing by zero.
+                if ba_area == 0 or (bbox_intersection_area(ba, bb) / ba_area) > 0.8:
+                    if bbox_longer(bb, ba):
+                        rest.discard(ba)
+                        # ba is gone; nothing left to compare it against
+                        break
+    # Filter t_bbox rather than listing the set so the original order is kept
+    unique_boxes = [t for t in t_bbox if t in rest]
+
+    return unique_boxes
+
+
+def bbox_intersection_area(ba, bb) -> float:
+    """Returns area of the intersection of the bounding boxes of two PDFMiner objects.
+
+    Parameters
+    ----------
+    ba : PDFMiner text object
+    bb : PDFMiner text object
+
+    Returns
+    -------
+    intersection_area : float
+        Area of the intersection of the bounding boxes of both objects
+
+    """
+    x_left = max(ba.x0, bb.x0)
+    y_top = min(ba.y1, bb.y1)
+    x_right = min(ba.x1, bb.x1)
+    y_bottom = max(ba.y0, bb.y0)
+
+    if x_right < x_left or y_bottom > y_top:
+        return 0.0
+
+    intersection_area = (x_right - x_left) * (y_top - y_bottom)
+    return intersection_area
+
+
+def bbox_area(bb) -> float:
+    """Returns area of the bounding box of a PDFMiner object.
+
+    Parameters
+    ----------
+    bb : PDFMiner text object
+
+    Returns
+    -------
+    area : float
+        Area of the bounding box of the object
+
+    """
+    return (bb.x1 - bb.x0) * (bb.y1 - bb.y0)
+
+
+def bbox_intersect(ba, bb) -> bool:
+    """Returns True if the bounding boxes of two PDFMiner objects intersect.
+
+    Parameters
+    ----------
+    ba : PDFMiner text object
+    bb : PDFMiner text object
+
+    Returns
+    -------
+    overlaps : bool
+        True if the bounding boxes intersect
+
+    """
+    return ba.x1 >= bb.x0 and bb.x1 >= ba.x0 and ba.y1 >= bb.y0 and bb.y1 >= ba.y0
+
+
+def bbox_longer(ba, bb) -> bool:
+    """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.
+
+    Parameters
+    ----------
+    ba : PDFMiner text object
+    bb : PDFMiner text object
+
+    Returns
+    -------
+    longer : bool
+        True if the bounding box of the first object is longer or equal
+
+    """
+    return (ba.x1 - ba.x0) >= (bb.x1 - bb.x0)
 
 
 def merge_close_lines(ar, line_tol=2):
diff --git a/tests/data.py b/tests/data.py
index 7e53792..b2bf706 100755
--- a/tests/data.py
+++ b/tests/data.py
@@ -2798,3 +2798,51 @@ data_stream_layout_kwargs = [
     ["A.O.P Cornas", ""],
     ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
 ]
+
+data_stream_duplicated_text = [
+    ['', '2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]', '', '', '', '', '', '', '', '',
+     'ALL SEASON TEST'],
+    ['', 'Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]', '', '', '', '', '', '', '', '',
+     '1.3 - 2.0 MAT. GROUP'],
+    ['PREV. CROP/HERB:', 'Corn / Surpass, Roundup', '', '', '', '', '', '', '', '', 'S2MNCE01'],
+    ['SOIL DESCRIPTION:', '', 'Canisteo clay loam, mod. well drained, non-irrigated', '', '', '', '', '', '', '', ''],
+    ['SOIL CONDITIONS:', '', 'High P, high K, 6.7 pH, 3.9% OM, Low SCN', '', '', '', '', '', '', '', '30" ROW SPACING'],
+    ['TILLAGE/CULTIVATION:', 'conventional w/ fall till', '', '', '', '', '', '', '', '', ''],
+    ['PEST MANAGEMENT:', 'Roundup twice', '', '', '', '', '', '', '', '', ''],
+    ['SEEDED - RATE:', 'May 15', '140 000 /A', '', '', '', '', '', '', 'TOP 30 for YIELD of 63 TESTED', ''],
+    ['HARVESTED - STAND:', 'Oct 3', '122 921 /A', '', '', '', '', '', '', 'AVERAGE of (3) REPLICATIONS', ''],
+    ['', '', '', '', 'SCN', 'Seed', 'Yield', 'Moisture', 'Lodging', 'Stand', 'Gross'],
+    ['Company/Brand', 'Product/Brand†', 'Technol.†', 'Mat.', 'Resist.', 'Trmt.†', 'Bu/A', '%', '%', '(x 1000)',
+     'Income'], ['Kruger', 'K2 1901', 'RR2Y', '1.9', 'R', 'Ac,PV', '56.4', '7.6', '0', '126.3', '$846'],
+    ['Stine', '19RA02 §', 'RR2Y', '1.9', 'R', 'CMB', '55.3', '7.6', '0', '120.0', '$830'],
+    ['Wensman', 'W 3190NR2', 'RR2Y', '1.9', 'R', 'Ac', '54.5', '7.6', '0', '119.5', '$818'],
+    ['Hefty', 'H17Y12', 'RR2Y', '1.7', 'MR', 'I', '53.7', '7.7', '0', '124.4', '$806'],
+    ['Dyna-Gro', 'S15RY53', 'RR2Y', '1.5', 'R', 'Ac', '53.6', '7.7', '0', '126.8', '$804'],
+    ['LG Seeds', 'C2050R2', 'RR2Y', '2.1', 'R', 'Ac', '53.6', '7.7', '0', '123.9', '$804'],
+    ['Titan Pro', '19M42', 'RR2Y', '1.9', 'R', 'CMB', '53.6', '7.7', '0', '121.0', '$804'],
+    ['Stine', '19RA02 (2) §', 'RR2Y', '1.9', 'R', 'CMB', '53.4', '7.7', '0', '123.9', '$801'],
+    ['Asgrow', 'AG1832 §', 'RR2Y', '1.8', 'MR', 'Ac,PV', '52.9', '7.7', '0', '122.0', '$794'],
+    ['Prairie Brand', 'PB-1566R2', 'RR2Y', '1.5', 'R', 'CMB', '52.8', '7.7', '0', '122.9', '$792'],
+    ['Channel', '1901R2', 'RR2Y', '1.9', 'R', 'Ac,PV', '52.8', '7.6', '0', '123.4', '$791'],
+    ['Titan Pro', '20M1', 'RR2Y', '2.0', 'R', 'Am', '52.5', '7.5', '0', '124.4', '$788'],
+    ['Kruger', 'K2-2002', 'RR2Y', '2.0', 'R', 'Ac,PV', '52.4', '7.9', '0', '125.4', '$786'],
+    ['Channel', '1700R2', 'RR2Y', '1.7', 'R', 'Ac,PV', '52.3', '7.9', '0', '123.9', '$784'],
+    ['Hefty', 'H16Y11', 'RR2Y', '1.6', 'MR', 'I', '51.4', '7.6', '0', '123.9', '$771'],
+    ['Anderson', '162R2Y', 'RR2Y', '1.6', 'R', 'None', '51.3', '7.5', '0', '119.5', '$770'],
+    ['Titan Pro', '15M22', 'RR2Y', '1.5', 'R', 'CMB', '51.3', '7.8', '0', '125.4', '$769'],
+    ['Dairyland', 'DSR-1710R2Y', 'RR2Y', '1.7', 'R', 'CMB', '51.3', '7.7', '0', '122.0', '$769'],
+    ['Hefty', 'H20R3', 'RR2Y', '2.0', 'MR', 'I', '50.5', '8.2', '0', '121.0', '$757'],
+    ['Prairie Brand', 'PB 1743R2', 'RR2Y', '1.7', 'R', 'CMB', '50.2', '7.7', '0', '125.8', '$752'],
+    ['Gold Country', '1741', 'RR2Y', '1.7', 'R', 'Ac', '50.1', '7.8', '0', '123.9', '$751'],
+    ['Trelay', '20RR43', 'RR2Y', '2.0', 'R', 'Ac,Ex', '49.9', '7.6', '0', '127.8', '$749'],
+    ['Hefty', 'H14R3', 'RR2Y', '1.4', 'MR', 'I', '49.7', '7.7', '0', '122.9', '$746'],
+    ['Prairie Brand', 'PB-2099NRR2', 'RR2Y', '2.0', 'R', 'CMB', '49.6', '7.8', '0', '126.3', '$743'],
+    ['Wensman', 'W 3174NR2', 'RR2Y', '1.7', 'R', 'Ac', '49.3', '7.6', '0', '122.5', '$740'],
+    ['Kruger', 'K2 1602', 'RR2Y', '1.6', 'R', 'Ac,PV', '48.7', '7.6', '0', '125.4', '$731'],
+    ['NK Brand', 'S18-C2 §', 'RR2Y', '1.8', 'R', 'CMB', '48.7', '7.7', '0', '126.8', '$731'],
+    ['Kruger', 'K2 1902', 'RR2Y', '1.9', 'R', 'Ac,PV', '48.7', '7.5', '0', '124.4', '$730'],
+    ['Prairie Brand', 'PB-1823R2', 'RR2Y', '1.8', 'R', 'None', '48.5', '7.6', '0', '121.0', '$727'],
+    ['Gold Country', '1541', 'RR2Y', '1.5', 'R', 'Ac', '48.4', '7.6', '0', '110.4', '$726'],
+    ['', '', '', '', '', 'Test Average =', '47.6', '7.7', '0', '122.9', '$713'],
+    ['', '', '', '', '', 'LSD (0.10) =', '5.7', '0.3', 'ns', '37.8', '566.4']
+]
diff --git a/tests/files/birdisland.pdf b/tests/files/birdisland.pdf
new file mode 100644
index 0000000..1501158
Binary files /dev/null and b/tests/files/birdisland.pdf differ
diff --git a/tests/test_common.py b/tests/test_common.py
index 6fadc9d..cb9a968 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -314,3 +314,11 @@ def test_version_generation_with_prerelease_revision():
         generate_version(version, prerelease=prerelease, revision=revision)
         == "0.7.3-alpha.2"
     )
+
+
+def test_stream_duplicated_text():
+    df = pd.DataFrame(data_stream_duplicated_text)
+
+    filename = os.path.join(testdir, "birdisland.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream")
+    assert_frame_equal(df, tables[0].df)