Improve column detection for hybrid flavor

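Column boundaries are no longer guessed from the mode of the number of
text elements per row; they are now derived from the column anchors computed
by the parsing analysis during network detection.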
Added a unit test for a complex table with a vertical header and mixed
horizontal/vertical text.
pull/153/head
Frh 2020-04-29 11:46:40 -07:00
parent 04fc542dc3
commit c0903b8ca9
12 changed files with 666 additions and 135 deletions

View File

@ -6,7 +6,6 @@ from __future__ import division
import copy import copy
import math import math
import numpy as np import numpy as np
import warnings
from .base import TextBaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
@ -18,6 +17,7 @@ from ..core import (
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
text_in_bbox, text_in_bbox,
textlines_overlapping_bbox,
bbox_from_textlines, bbox_from_textlines,
find_columns_coordinates, find_columns_coordinates,
text_in_bbox_per_axis, text_in_bbox_per_axis,
@ -321,11 +321,17 @@ class TextNetworks(TextAlignments):
horizontal axis. horizontal axis.
""" """
# Find the textline with the highest alignment score # Find the textline with the highest alignment score, with a tie break
# to prefer textlines further down in the table. Starting the search
# from the table's bottom allows the algo to collect data on more cells
# before going to the header, typically harder to parse.
return max( return max(
self._textline_to_alignments.keys(), self._textline_to_alignments.keys(),
key=lambda textline: key=lambda textline:
(
self._textline_to_alignments[textline].alignment_score(), self._textline_to_alignments[textline].alignment_score(),
-textline.y0
),
default=None default=None
) )
@ -566,12 +572,13 @@ class Hybrid(TextBaseParser):
) )
def _generate_table_bbox(self): def _generate_table_bbox(self):
user_provided_bboxes = None
if self.table_areas is not None: if self.table_areas is not None:
table_bbox = {} # User gave us table areas already. We will use their coordinates
# to find column anchors.
user_provided_bboxes = []
for area_str in self.table_areas: for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None user_provided_bboxes.append(bbox_from_str(area_str))
self.table_bbox = table_bbox
return
# Take all the textlines that are not just spaces # Take all the textlines that are not just spaces
all_textlines = [ all_textlines = [
@ -593,6 +600,15 @@ class Hybrid(TextBaseParser):
parse_details_bbox_searches = None parse_details_bbox_searches = None
while True: while True:
# Find a bbox: either pulling from the user's or from the network
# algorithm.
# First look for the body of the table
bbox_body = None
if user_provided_bboxes is not None:
if len(user_provided_bboxes) > 0:
bbox_body = user_provided_bboxes.pop()
else:
text_network = TextNetworks() text_network = TextNetworks()
text_network.generate(textlines) text_network.generate(textlines)
text_network._remove_unconnected_edges() text_network._remove_unconnected_edges()
@ -604,48 +620,53 @@ class Hybrid(TextBaseParser):
gaps_hv[0], gaps_hv[0],
gaps_hv[1] if self.edge_tol is None else self.edge_tol gaps_hv[1] if self.edge_tol is None else self.edge_tol
) )
bbox = text_network._build_bbox_candidate( bbox_body = text_network._build_bbox_candidate(
edge_tol_hv, edge_tol_hv,
parse_details=parse_details_bbox_searches parse_details=parse_details_bbox_searches
) )
if bbox is None:
break
if parse_details_network_searches is not None: if parse_details_network_searches is not None:
# Preserve the current edge calculation for display debugging # Preserve the current edge calculation for debugging
parse_details_network_searches.append( parse_details_network_searches.append(
copy.deepcopy(text_network) copy.deepcopy(text_network)
) )
# Get all the textlines that are at least 50% in the box if bbox_body is None:
tls_in_bbox = text_in_bbox(bbox, textlines) break
# and expand the text box to fully contain them # Get all the textlines that overlap with the box, compute
bbox = bbox_from_textlines(tls_in_bbox) # columns
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
# FRH: do we need to repeat this?
# tls_in_bbox = text_in_bbox(bbox, textlines)
cols_anchors = find_columns_coordinates(tls_in_bbox) cols_anchors = find_columns_coordinates(tls_in_bbox)
# Apply a heuristic to salvage headers which formatting might be # Unless the user gave us strict bbox_body, try to find a header
# off compared to the rest of the table. # above the body to build the full bbox.
expanded_bbox = search_header_from_body_bbox( if user_provided_bboxes is not None:
bbox, bbox_full = bbox_body
else:
# Expand the text box to fully contain the tls we found
bbox_body = bbox_from_textlines(tls_in_bbox)
# Apply a heuristic to salvage headers which formatting might
# be off compared to the rest of the table.
bbox_full = search_header_from_body_bbox(
bbox_body,
textlines, textlines,
cols_anchors, cols_anchors,
gaps_hv[1] gaps_hv[1]
) )
table_parse = {
"bbox_body": bbox_body,
"cols_anchors": cols_anchors,
"bbox_full": bbox_full
}
self.table_bbox[bbox_full] = table_parse
if self.parse_details is not None: if self.parse_details is not None:
if "col_searches" not in self.parse_details: if "col_searches" not in self.parse_details:
self.parse_details["col_searches"] = [] self.parse_details["col_searches"] = []
self.parse_details["col_searches"].append({ self.parse_details["col_searches"].append(table_parse)
"core_bbox": bbox,
"cols_anchors": cols_anchors,
"expanded_bbox": expanded_bbox
})
self.table_bbox[expanded_bbox] = None
# Remember what textlines we processed, and repeat # Remember what textlines we processed, and repeat
for tl in tls_in_bbox: for tl in tls_in_bbox:
@ -682,7 +703,6 @@ class Hybrid(TextBaseParser):
# the alignment identification work we've done earlier. # the alignment identification work we've done earlier.
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol) rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "": if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too # user has to input boundary columns too
@ -695,53 +715,11 @@ class Hybrid(TextBaseParser):
cols.append(text_x_max) cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else: else:
# calculate mode of the list of number of elements in parse_details = self.table_bbox[bbox]
# each row to guess the number of columns col_anchors = parse_details["cols_anchors"]
ncols = max(set(elements), key=elements.count) cols = list(map(
if ncols == 1: lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
# if mode is 1, the page usually contains not tables range(0, len(col_anchors) - 1)
# but there can be cases where the list can be skewed, ))
# try to remove all 1s from list in this case and
# see if the list contains elements, if yes, then use
# the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements))
if elements:
ncols = max(set(elements), key=elements.count)
else:
warnings.warn(
"No tables found in table area {}"
.format(table_idx + 1)
)
cols = [
(t.x0, t.x1)
for r in rows_grouped
if len(r) == ncols
for t in r
]
cols = self._merge_columns(
sorted(cols),
column_tol=self.column_tol
)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows, None, None return cols, rows, None, None

View File

@ -472,7 +472,7 @@ class PlotMethods():
for box_id, col_search in enumerate(parse_details["col_searches"]): for box_id, col_search in enumerate(parse_details["col_searches"]):
draw_labeled_bbox( draw_labeled_bbox(
ax, col_search["expanded_bbox"], ax, col_search["bbox_full"],
"box body + header #{box_id}".format( "box body + header #{box_id}".format(
box_id=box_id box_id=box_id
), ),
@ -481,7 +481,7 @@ class PlotMethods():
label_pos="top,left" label_pos="top,left"
) )
draw_labeled_bbox( draw_labeled_bbox(
ax, col_search["core_bbox"], ax, col_search["bbox_body"],
"box body #{box_id}".format( "box body #{box_id}".format(
box_id=box_id box_id=box_id
), ),
@ -495,8 +495,8 @@ class PlotMethods():
ax.plot( ax.plot(
[col_anchor, col_anchor], [col_anchor, col_anchor],
[ [
col_search["core_bbox"][1] - 10, col_search["bbox_body"][1] - 10,
col_search["core_bbox"][3] + 10, col_search["bbox_body"][3] + 10,
], ],
color="green" color="green"
) )

View File

@ -431,8 +431,36 @@ def bbox_from_str(bbox_str):
) )
def textlines_overlapping_bbox(bbox, textlines):
    """Returns all text objects which overlap or are within a bounding box.

    A textline is kept when its horizontal extent and its vertical extent
    both intersect the interior of the bounding box. This covers textlines
    fully inside the box, textlines crossing one of its edges, and
    textlines that span the box entirely (or coincide with it exactly).
    Textlines that merely touch the box along an edge are not returned.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    textlines : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects, in the same order as `textlines`.

    """
    (left, bottom, right, top) = bbox
    # Standard interval-intersection test on each axis. Checking only
    # whether an endpoint falls inside the box would miss a textline that
    # starts before the box and ends after it.
    t_bbox = [
        t
        for t in textlines
        if t.x0 < right and t.x1 > left
        and t.y0 < top and t.y1 > bottom
    ]
    return t_bbox
def text_in_bbox(bbox, text): def text_in_bbox(bbox, text):
"""Returns all text objects which lie at least 50% inside a bounding box. """Returns all text objects which lie at least 50% inside a bounding box
across both dimensions.
Parameters Parameters
---------- ----------

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1629,6 +1629,453 @@ data_hybrid_two_tables_b_2 = [
# Trimming the table for the test of hybrid, which doesn't include it. # Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1] data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
# Expected cell contents for the hybrid-flavor vertical-header test:
# 17 rows of 24 columns each. The first four rows hold the header text,
# the remaining rows hold one line of figures each, ending with "Totals".
data_hybrid_vertical_headers = [
    ["", "", "", "", "", "", "", "", "", "", "",
     "Congress-", "Senator 36th", "Rep106th", "", "Reg. of", "Road",
     "", "", "", "Distri", "Dist", "", ""],
    ["", "", "", "", "", "", "", "", "", "", "",
     "1st Dist", "Dist.", "Dist.", "", "Deeds", "", "Commission",
     "", "District #1", "ct #2", "#3", "", "Dist #4"],
    ["", "", "", "", "", "Governor", "", "", "U.S. Senator",
     "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
    ["", "Number of Registered voters", "Poll Book Totals",
     "Brian Calley", "Patrick Colbeck", "Jim Hines", "Bill Schuette",
     "John James", "Sandy Pensler", "", "Jack Bergman", "",
     "Jim Stamas", "Sue Allor", "Melissa A. Cordes", "", "Al Scully", "",
     "Daniel G. Gauthier", "Craig M. Clemens", "Craig Johnston",
     "Carolyn Brummund", "Adam Brege", "David Bielusiak"],
    ["Alcona", "963", "439", "55", "26", "47", "164", "173", "111",
     "", "268", "", "272", "275", "269", "", "271", "",
     "224", "76", "", "", "", ""],
    ["Caledonia", "923", "393", "40", "23", "45", "158", "150", "103",
     "", "244", "", "247", "254", "255", "", "244", "",
     "139", "143", "", "", "", ""],
    ["Curtis", "1026", "349", "30", "30", "25", "102", "95", "84",
     "", "159", "", "164", "162", "161", "", "157", "",
     "", "", "", "", "", ""],
    ["Greenbush", "1212", "423", "56", "26", "40", "126", "104", "131",
     "", "208", "", "213", "214", "215", "", "208", "",
     "", "", "", "208", "", ""],
    ["Gustin", "611", "180", "22", "35", "17", "55", "73", "45",
     "", "108", "", "104", "111", "111", "", "109", "",
     "", "", "", "", "81", "42"],
    ["Harrisville", "1142", "430", "45", "90", "29", "101", "155", "94",
     "", "226", "", "226", "232", "244", "", "226", "",
     "", "", "232", "", "", ""],
    ["Hawes", "884", "293", "38", "36", "27", "109", "121", "84",
     "", "192", "", "195", "195", "193", "", "184", "",
     "", "", "", "", "118", "87"],
    ["Haynes", "626", "275", "31", "20", "32", "104", "121", "53",
     "", "163", "", "163", "173", "161", "", "152", "",
     "", "", "76", "", "69", "31"],
    ["Mikado", "781", "208", "19", "39", "17", "81", "90", "63",
     "", "149", "", "149", "145", "147", "", "143", "",
     "", "", "", "113", "", ""],
    ["Millen", "353", "139", "7", "16", "13", "38", "49", "19",
     "", "62", "", "66", "67", "66", "", "62", "",
     "", "", "", "", "", ""],
    ["Mitchell", "327", "96", "12", "17", "7", "29", "41", "17",
     "", "57", "", "55", "57", "60", "", "56", "",
     "", "", "", "", "", ""],
    ["City Harrisville", "389", "171", "16", "15", "18", "35", "49", "31",
     "", "78", "", "80", "82", "81", "", "77", "",
     "", "", "73", "", "", ""],
    ["Totals", "9237", "3396", "371", "373", "317", "1102", "1221", "835",
     "0", "1914", "0", "1934", "1967", "1963", "0", "1889", "0",
     "363", "219", "381", "321", "268", "160"],
]
data_stream_table_areas = [ data_stream_table_areas = [
["", "One Withholding"], ["", "One Withholding"],
["Payroll Period", "Allowance"], ["Payroll Period", "Allowance"],

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 90 KiB

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 101 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

View File

@ -194,6 +194,17 @@ def test_hybrid_two_tables_b():
assert df2.equals(tables[1].df) assert df2.equals(tables[1].df)
def test_hybrid_vertical_header():
    """Tests a complex table whose header contains vertical text."""
    pdf_path = os.path.join(testdir, "vertical_header.pdf")
    parsed_tables = camelot.read_pdf(pdf_path, flavor="hybrid")
    # The whole page is expected to be detected as a single table.
    assert len(parsed_tables) == 1

    expected_df = pd.DataFrame(data_hybrid_vertical_headers)
    assert_frame_equal(expected_df, parsed_tables[0].df)
def test_hybrid_table_regions(): def test_hybrid_table_regions():
df = pd.DataFrame(data_hybrid_table_regions) df = pd.DataFrame(data_hybrid_table_regions)