Improve column detection for hybrid flavor

Column detection no longer relies on the mode of the per-row element
counts; it now reuses the column anchors computed by the parsing analysis
during network detection.
Added a unit test for a complex table with a vertical header and mixed
horizontal / vertical text.
pull/153/head
Frh 2020-04-29 11:46:40 -07:00
parent e31e978ebe
commit ada4809a59
12 changed files with 666 additions and 135 deletions

View File

@ -6,7 +6,6 @@ from __future__ import division
import copy
import math
import numpy as np
import warnings
from .base import TextBaseParser
from ..core import (
@ -18,6 +17,7 @@ from ..core import (
from ..utils import (
bbox_from_str,
text_in_bbox,
textlines_overlapping_bbox,
bbox_from_textlines,
find_columns_coordinates,
text_in_bbox_per_axis,
@ -321,11 +321,17 @@ class TextNetworks(TextAlignments):
horizontal axis.
"""
# Find the textline with the highest alignment score
# Find the textline with the highest alignment score, with a tie break
# to prefer textlines further down in the table. Starting the search
# from the table's bottom allows the algo to collect data on more cells
# before going to the header, typically harder to parse.
return max(
self._textline_to_alignments.keys(),
key=lambda textline:
self._textline_to_alignments[textline].alignment_score(),
(
self._textline_to_alignments[textline].alignment_score(),
-textline.y0
),
default=None
)
@ -566,12 +572,13 @@ class Hybrid(TextBaseParser):
)
def _generate_table_bbox(self):
user_provided_bboxes = None
if self.table_areas is not None:
table_bbox = {}
# User gave us table areas already. We will use their coordinates
# to find column anchors.
user_provided_bboxes = []
for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox
return
user_provided_bboxes.append(bbox_from_str(area_str))
# Take all the textlines that are not just spaces
all_textlines = [
@ -593,59 +600,73 @@ class Hybrid(TextBaseParser):
parse_details_bbox_searches = None
while True:
text_network = TextNetworks()
text_network.generate(textlines)
text_network._remove_unconnected_edges()
gaps_hv = text_network._compute_plausible_gaps()
if gaps_hv is None:
return None
# edge_tol instructions override the calculated vertical gap
edge_tol_hv = (
gaps_hv[0],
gaps_hv[1] if self.edge_tol is None else self.edge_tol
)
bbox = text_network._build_bbox_candidate(
edge_tol_hv,
parse_details=parse_details_bbox_searches
)
if bbox is None:
break
# Find a bbox: either pulling from the user's or from the network
# algorithm.
if parse_details_network_searches is not None:
# Preserve the current edge calculation for display debugging
parse_details_network_searches.append(
copy.deepcopy(text_network)
# First look for the body of the table
bbox_body = None
if user_provided_bboxes is not None:
if len(user_provided_bboxes) > 0:
bbox_body = user_provided_bboxes.pop()
else:
text_network = TextNetworks()
text_network.generate(textlines)
text_network._remove_unconnected_edges()
gaps_hv = text_network._compute_plausible_gaps()
if gaps_hv is None:
return None
# edge_tol instructions override the calculated vertical gap
edge_tol_hv = (
gaps_hv[0],
gaps_hv[1] if self.edge_tol is None else self.edge_tol
)
bbox_body = text_network._build_bbox_candidate(
edge_tol_hv,
parse_details=parse_details_bbox_searches
)
# Get all the textlines that are at least 50% in the box
tls_in_bbox = text_in_bbox(bbox, textlines)
if parse_details_network_searches is not None:
# Preserve the current edge calculation for debugging
parse_details_network_searches.append(
copy.deepcopy(text_network)
)
# and expand the text box to fully contain them
bbox = bbox_from_textlines(tls_in_bbox)
if bbox_body is None:
break
# FRH: do we need to repeat this?
# tls_in_bbox = text_in_bbox(bbox, textlines)
# Get all the textlines that overlap with the box, compute
# columns
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
cols_anchors = find_columns_coordinates(tls_in_bbox)
# Apply a heuristic to salvage headers which formatting might be
# off compared to the rest of the table.
expanded_bbox = search_header_from_body_bbox(
bbox,
textlines,
cols_anchors,
gaps_hv[1]
)
# Unless the user gave us strict bbox_body, try to find a header
# above the body to build the full bbox.
if user_provided_bboxes is not None:
bbox_full = bbox_body
else:
# Expand the text box to fully contain the tls we found
bbox_body = bbox_from_textlines(tls_in_bbox)
# Apply a heuristic to salvage headers whose formatting might
# be off compared to the rest of the table.
bbox_full = search_header_from_body_bbox(
bbox_body,
textlines,
cols_anchors,
gaps_hv[1]
)
table_parse = {
"bbox_body": bbox_body,
"cols_anchors": cols_anchors,
"bbox_full": bbox_full
}
self.table_bbox[bbox_full] = table_parse
if self.parse_details is not None:
if "col_searches" not in self.parse_details:
self.parse_details["col_searches"] = []
self.parse_details["col_searches"].append({
"core_bbox": bbox,
"cols_anchors": cols_anchors,
"expanded_bbox": expanded_bbox
})
self.table_bbox[expanded_bbox] = None
self.parse_details["col_searches"].append(table_parse)
# Remember what textlines we processed, and repeat
for tl in tls_in_bbox:
@ -682,7 +703,6 @@ class Hybrid(TextBaseParser):
# the alignment identification work we've done earlier.
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
@ -695,53 +715,11 @@ class Hybrid(TextBaseParser):
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
ncols = max(set(elements), key=elements.count)
if ncols == 1:
# if mode is 1, the page usually contains not tables
# but there can be cases where the list can be skewed,
# try to remove all 1s from list in this case and
# see if the list contains elements, if yes, then use
# the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements))
if elements:
ncols = max(set(elements), key=elements.count)
else:
warnings.warn(
"No tables found in table area {}"
.format(table_idx + 1)
)
cols = [
(t.x0, t.x1)
for r in rows_grouped
if len(r) == ncols
for t in r
]
cols = self._merge_columns(
sorted(cols),
column_tol=self.column_tol
)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
parse_details = self.table_bbox[bbox]
col_anchors = parse_details["cols_anchors"]
cols = list(map(
lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
range(0, len(col_anchors) - 1)
))
return cols, rows, None, None

View File

@ -472,7 +472,7 @@ class PlotMethods():
for box_id, col_search in enumerate(parse_details["col_searches"]):
draw_labeled_bbox(
ax, col_search["expanded_bbox"],
ax, col_search["bbox_full"],
"box body + header #{box_id}".format(
box_id=box_id
),
@ -481,7 +481,7 @@ class PlotMethods():
label_pos="top,left"
)
draw_labeled_bbox(
ax, col_search["core_bbox"],
ax, col_search["bbox_body"],
"box body #{box_id}".format(
box_id=box_id
),
@ -495,8 +495,8 @@ class PlotMethods():
ax.plot(
[col_anchor, col_anchor],
[
col_search["core_bbox"][1] - 10,
col_search["core_bbox"][3] + 10,
col_search["bbox_body"][1] - 10,
col_search["bbox_body"][3] + 10,
],
color="green"
)

View File

@ -431,8 +431,36 @@ def bbox_from_str(bbox_str):
)
def textlines_overlapping_bbox(bbox, textlines):
    """Returns all text objects which overlap or are within a bounding box.

    A text object is kept when its own box and ``bbox`` intersect on both
    the horizontal and the vertical axis. This includes text objects that
    are fully inside ``bbox``, that only partially overlap it, that span
    it entirely along an axis, or whose edges coincide with the edges of
    ``bbox``. Text objects that merely touch ``bbox`` from the outside
    (shared edge, no shared area) are not returned.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    textlines : List of PDFMiner text objects.
        Each object must expose ``x0``, ``y0``, ``x1`` and ``y1``
        attributes (assumes ``x0 <= x1`` and ``y0 <= y1``).

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects, in the same order as in
        ``textlines``.

    """
    (left, bottom, right, top) = bbox
    # Interval intersection on each axis. Testing only whether one edge of
    # the textline falls strictly inside the box would miss textlines that
    # are wider/taller than the box, or that exactly match its edges.
    t_bbox = [
        t
        for t in textlines
        if t.x0 < right and t.x1 > left
        and t.y0 < top and t.y1 > bottom
    ]
    return t_bbox
def text_in_bbox(bbox, text):
"""Returns all text objects which lie at least 50% inside a bounding box.
"""Returns all text objects which lie at least 50% inside a bounding box
across both dimensions.
Parameters
----------

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1629,6 +1629,453 @@ data_hybrid_two_tables_b_2 = [
# Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
data_hybrid_vertical_headers = [
[
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"Congress-",
"Senator 36th",
"Rep106th",
"",
"Reg. of",
"Road",
"",
"",
"",
"Distri",
"Dist",
"",
"",
],
[
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"1st Dist",
"Dist.",
"Dist.",
"",
"Deeds",
"",
"Commission",
"",
"District #1",
"ct #2",
"#3",
"",
"Dist #4",
],
[
"",
"",
"",
"",
"",
"Governor",
"",
"",
"U.S. Senator",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
[
"",
"Number of Registered voters",
"Poll Book Totals",
"Brian Calley",
"Patrick Colbeck",
"Jim Hines",
"Bill Schuette",
"John James",
"Sandy Pensler",
"",
"Jack Bergman",
"",
"Jim Stamas",
"Sue Allor",
"Melissa A. Cordes",
"",
"Al Scully",
"",
"Daniel G. Gauthier",
"Craig M. Clemens",
"Craig Johnston",
"Carolyn Brummund",
"Adam Brege",
"David Bielusiak",
],
[
"Alcona",
"963",
"439",
"55",
"26",
"47",
"164",
"173",
"111",
"",
"268",
"",
"272",
"275",
"269",
"",
"271",
"",
"224",
"76",
"",
"",
"",
"",
],
[
"Caledonia",
"923",
"393",
"40",
"23",
"45",
"158",
"150",
"103",
"",
"244",
"",
"247",
"254",
"255",
"",
"244",
"",
"139",
"143",
"",
"",
"",
"",
],
[
"Curtis",
"1026",
"349",
"30",
"30",
"25",
"102",
"95",
"84",
"",
"159",
"",
"164",
"162",
"161",
"",
"157",
"",
"",
"",
"",
"",
"",
"",
],
[
"Greenbush",
"1212",
"423",
"56",
"26",
"40",
"126",
"104",
"131",
"",
"208",
"",
"213",
"214",
"215",
"",
"208",
"",
"",
"",
"",
"208",
"",
"",
],
[
"Gustin",
"611",
"180",
"22",
"35",
"17",
"55",
"73",
"45",
"",
"108",
"",
"104",
"111",
"111",
"",
"109",
"",
"",
"",
"",
"",
"81",
"42",
],
[
"Harrisville",
"1142",
"430",
"45",
"90",
"29",
"101",
"155",
"94",
"",
"226",
"",
"226",
"232",
"244",
"",
"226",
"",
"",
"",
"232",
"",
"",
"",
],
[
"Hawes",
"884",
"293",
"38",
"36",
"27",
"109",
"121",
"84",
"",
"192",
"",
"195",
"195",
"193",
"",
"184",
"",
"",
"",
"",
"",
"118",
"87",
],
[
"Haynes",
"626",
"275",
"31",
"20",
"32",
"104",
"121",
"53",
"",
"163",
"",
"163",
"173",
"161",
"",
"152",
"",
"",
"",
"76",
"",
"69",
"31",
],
[
"Mikado",
"781",
"208",
"19",
"39",
"17",
"81",
"90",
"63",
"",
"149",
"",
"149",
"145",
"147",
"",
"143",
"",
"",
"",
"",
"113",
"",
"",
],
[
"Millen",
"353",
"139",
"7",
"16",
"13",
"38",
"49",
"19",
"",
"62",
"",
"66",
"67",
"66",
"",
"62",
"",
"",
"",
"",
"",
"",
"",
],
[
"Mitchell",
"327",
"96",
"12",
"17",
"7",
"29",
"41",
"17",
"",
"57",
"",
"55",
"57",
"60",
"",
"56",
"",
"",
"",
"",
"",
"",
"",
],
[
"City Harrisville",
"389",
"171",
"16",
"15",
"18",
"35",
"49",
"31",
"",
"78",
"",
"80",
"82",
"81",
"",
"77",
"",
"",
"",
"73",
"",
"",
"",
],
[
"Totals",
"9237",
"3396",
"371",
"373",
"317",
"1102",
"1221",
"835",
"0",
"1914",
"0",
"1934",
"1967",
"1963",
"0",
"1889",
"0",
"363",
"219",
"381",
"321",
"268",
"160",
],
]
data_stream_table_areas = [
["", "One Withholding"],
["Payroll Period", "Allowance"],

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 90 KiB

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 101 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

View File

@ -200,6 +200,17 @@ def test_hybrid_two_tables_b():
assert df2.equals(tables[1].df)
def test_hybrid_vertical_header():
    """Hybrid flavor parses a complex table with a vertical text header."""
    pdf_path = os.path.join(testdir, "vertical_header.pdf")
    expected = pd.DataFrame(data_hybrid_vertical_headers)

    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")

    # The page holds a single table, which must match the fixture exactly.
    assert len(parsed) == 1
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_table_regions():
df = pd.DataFrame(data_hybrid_table_regions)