WIP: Introduce actual hybrid parser

Create a hybrid parser that leverages both lattice and network techniques.
Simplify plotting of pdf in lattice case.
Rename "parser.table_bbox" to "parser.table_bbox_parses", since it is
not a bbox but a dict mapping each bbox to its corresponding parsing data.

Still missing: more unit tests, plotting of steps.
pull/153/head
Frh 2020-05-04 16:27:01 -07:00
parent edad1efd1b
commit 4a761611bf
17 changed files with 1011 additions and 217 deletions

View File

@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
"Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
filepath, pages=pages, flavor="network",
suppress_stdout=quiet, **kwargs
)
click.echo("Found {} tables".format(tables.n))
if plot_type is not None:

View File

@ -454,7 +454,9 @@ class Table():
self.page = None
self.flavor = None # Flavor of the parser used
self.pdf_size = None # Dimensions of the original PDF page
self.parse_details = None # Field holding debug data
self._bbox = None # Bounding box in original document
self.parse = None # Parse information
self.parse_details = None # Field holding extra debug data
self._image = None
self._image_path = None # Temporary file to hold an image of the pdf

View File

@ -7,7 +7,7 @@ import logging
from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice, Network
from .parsers import Stream, Lattice, Network, Hybrid
from .utils import (
build_file_path_in_temp_dir,
get_page_layout,
@ -23,6 +23,7 @@ PARSERS = {
"lattice": Lattice,
"stream": Stream,
"network": Network,
"hybrid": Hybrid,
}
@ -177,7 +178,8 @@ class PDFHandler():
Parameters
----------
flavor : str (default: 'lattice')
The parsing method to use ('lattice', 'stream', or 'network').
The parsing method to use ('lattice', 'stream', 'network',
or 'hybrid').
Lattice is used by default.
suppress_stdout : str (default: False)
Suppress logs and warnings.

View File

@ -6,7 +6,9 @@ import cv2
import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def adaptive_threshold(
imagename, process_background=False,
blocksize=15, c=-2):
"""Thresholds an image using OpenCV's adaptiveThreshold.
Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
c : int, optional (default: -2)
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
Returns
-------
@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
if process_background:
threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, blocksize, c
)
else:
threshold = cv2.adaptiveThreshold(
@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def find_lines(
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
threshold, regions=None,
direction="horizontal", line_scale=15, iterations=0
):
"""Finds horizontal and vertical lines by applying morphological
transformations on an image.
@ -78,7 +84,7 @@ def find_lines(
iterations : int, optional (default: 0)
Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
Returns
-------
@ -100,13 +106,15 @@ def find_lines(
size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None:
raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
raise ValueError(
"Specify direction as either 'vertical' or 'horizontal'"
)
if regions is not None:
region_mask = np.zeros(threshold.shape)
for region in regions:
x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1
region_mask[y:y + h, x:x + w] = 1
threshold = np.multiply(threshold, region_mask)
threshold = cv2.erode(threshold, el)
@ -115,12 +123,14 @@ def find_lines(
try:
_, contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
)
except ValueError:
# for opencv backward compatibility
contours, _ = cv2.findContours(
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
)
for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
tables = {}
for c in contours:
x, y, w, h = c
roi = joints[y : y + h, x : x + w]
roi = joints[y:y + h, x:x + w]
try:
__, jc, __ = cv2.findContours(
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE

View File

@ -99,7 +99,7 @@ def read_pdf(
"""
layout_kwargs = layout_kwargs or {}
if flavor not in ["lattice", "stream", "network"]:
if flavor not in ["lattice", "stream", "network", "hybrid"]:
raise NotImplementedError(
"Unknown flavor specified."
" Use either 'lattice', 'stream', or 'network'"

View File

@ -3,3 +3,4 @@
from .stream import Stream
from .lattice import Lattice
from .network import Network
from .hybrid import Hybrid

View File

@ -34,8 +34,9 @@ class BaseParser():
self.id = parser_id
self.table_regions = table_regions
self.table_areas = table_areas
self.table_bbox = {}
self.table_bbox_parses = {}
self.columns = None
self.copy_text = copy_text
self.split_text = split_text
self.strip_text = strip_text
@ -47,10 +48,18 @@ class BaseParser():
self.t_bbox = None
# For plotting details of parsing algorithms
self.resolution = 300 # default plotting resolution of the PDF.
self.parse_details = {}
if not debug:
self.parse_details = None
def table_bboxes(self):
    """List the bounding boxes of the detected tables.

    Returns
    -------
    bboxes : list
        Keys of ``self.table_bbox_parses``, ordered by decreasing value of
        their second coordinate (index 1, the y-coordinate the tables are
        sorted on).
    """
    bboxes = list(self.table_bbox_parses)
    # Stable sort: bboxes sharing the same y keep their insertion order.
    bboxes.sort(key=lambda bbox: bbox[1], reverse=True)
    return bboxes
def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs):
self.filename = filename
@ -142,6 +151,7 @@ class BaseParser():
table = Table(cols, rows)
table.page = self.page
table.order = table_idx + 1
table._bbox = self.table_bboxes()[table_idx]
return table
@staticmethod
@ -177,7 +187,7 @@ class BaseParser():
table.cells[r_idx][c_idx].text = text
return pos_errors
def _generate_columns_and_rows(self, bbox, table_idx):
def _generate_columns_and_rows(self, bbox, user_cols):
# Pure virtual, must be defined by the derived parser
raise NotImplementedError()
@ -199,20 +209,23 @@ class BaseParser():
_tables = []
# sort tables based on y-coord
for table_idx, bbox in enumerate(
sorted(
self.table_bbox.keys(),
key=lambda x: x[1],
reverse=True
)
):
for table_idx, bbox in enumerate(self.table_bboxes()):
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
user_cols = self.columns[table_idx].split(",")
user_cols = [float(c) for c in user_cols]
else:
user_cols = None
cols, rows, v_s, h_s = self._generate_columns_and_rows(
bbox,
table_idx
user_cols
)
table = self._generate_table(
table_idx, cols, rows, v_s=v_s, h_s=h_s)
table._bbox = bbox
_tables.append(table)
return _tables
@ -222,6 +235,7 @@ class BaseParser():
"""
table.flavor = self.id
table.filename = self.filename
table.parse = self.table_bbox_parses[table._bbox]
table.parse_details = self.parse_details
pos_errors = self.compute_parse_errors(table)
table.accuracy = compute_accuracy([[100, pos_errors]])
@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
raise ValueError("Length of table_areas and columns"
" should be equal")
def record_parse_metadata(self, table):
"""Record data about the origin of the table
"""
super().record_parse_metadata(table)
# for plotting
table._bbox = self.table_bbox
table._segments = None
def _generate_table(self, table_idx, cols, rows, **kwargs):
    """Build the table for the given index, with all cell edges set.

    The table is created through ``_initialize_new_table``, every edge is
    set with ``set_all_edges`` and the parse metadata is recorded on it
    before it is returned. Extra keyword arguments are accepted but not
    used here.
    """
    new_table = self._initialize_new_table(
        table_idx, cols, rows
    ).set_all_edges()
    self.record_parse_metadata(new_table)
    return new_table
def record_parse_metadata(self, table):
    """Attach information about the table's origin to ``table``.

    Delegates to the parent implementation, then clears the segments
    attribute that plotting reads.
    """
    super().record_parse_metadata(table)
    # Plotting data: this parser records no line segments on the table.
    table._segments = None

View File

@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
from ..utils import (
bboxes_overlap,
boundaries_to_split_lines,
)
from .base import BaseParser
from .network import Network
from .lattice import Lattice
class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.

    Both a Network and a Lattice parser are run on each page. Their
    bounding boxes are then reconciled: for each resulting table bbox,
    ``table_bbox_parses`` records which of the two sub-parsers is in charge
    of generating that table.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
        Forwarded unchanged to both sub-parsers.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    debug : bool, optional (default: False)
        Forwarded to the base class and to both sub-parsers.
    **kwargs
        Accepted for signature compatibility; not used by this class.

    """

    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            debug=debug,
        )
        # The base class resets self.columns to None and reads it when it
        # generates the tables: store the user setting here, otherwise the
        # user-provided columns are silently ignored by the hybrid parser.
        self.columns = columns
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
        self.lattice_parser = Lattice(
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )

    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        """Prepare this parser and both sub-parsers for a new page."""
        super().prepare_page_parse(filename, layout, dimensions,
                                   page_idx, layout_kwargs)
        self.network_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)
        self.lattice_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)

    def _generate_columns_and_rows(self, bbox, table_idx):
        """Delegate to the sub-parser in charge of ``bbox``.

        The second argument is forwarded untouched. The base class
        declares it as ``user_cols`` (the user-provided column
        coordinates, or None) and passes it positionally, so despite its
        name here it is not a table index.
        """
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, table_idx)

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Generate the table using the sub-parser in charge of its bbox.

        Parameters
        ----------
        table_idx : int
            Index of the table in this parser's ``table_bboxes()``.
        cols, rows : list
            Columns and rows, as returned by _generate_columns_and_rows.
        **kwargs
            Forwarded to the sub-parser's ``_generate_table``.

        """
        bbox = self.table_bboxes()[table_idx]
        parser = self.table_bbox_parses[bbox]
        # The sub-parser resolves the table's bbox by indexing its OWN
        # table_bboxes() list, which differs from the hybrid one as soon as
        # both sub-parsers contribute tables. Translate the index so it
        # picks the right bbox instead of a wrong one (or an IndexError).
        parser_idx = parser.table_bboxes().index(bbox)
        table = parser._generate_table(parser_idx, cols, rows, **kwargs)
        # Restore the position of the table within the hybrid results.
        table.order = table_idx + 1
        return table

    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """Augment existing boundaries using provided hard splits.

        Schematic illustration:

        Boundaries:  |---|    |-| |---------|
        Splits:      |       |     |          |
        Augmented:   |-------|-----|-------|--|

        Both lists are walked from their last element to their first.
        ``boundaries`` is modified in place and also returned. It must not
        be empty when ``splits`` is not, since its first element is read
        to insert the remaining splits.

        Parameters
        ----------
        boundaries : list
            List of [start, end] lists, updated in place.
        splits : list
            Coordinates of the hard splits to incorporate.
        tolerance : number, optional (default: 0)
            Margin applied when comparing a split to a boundary's edges.

        Returns
        -------
        boundaries : list
            The same list object, augmented.

        """
        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        # Boundary visited just before the current one, i.e. the one
        # following it in the list.
        previous_boundary = None
        # Loop until there are no more splits to incorporate.
        while idx_splits >= 0:
            split = splits[idx_splits]
            if idx_boundaries < 0:
                # All boundaries were visited: each remaining split opens
                # a new boundary ending where the current first one starts.
                new_boundary = [split, boundaries[0][0]]
                boundaries.insert(0, new_boundary)
                idx_splits -= 1
            else:
                boundary = boundaries[idx_boundaries]
                if boundary[1] < split + tolerance:
                    # The split lies past the end of this boundary: extend
                    # the end of the boundary up to the split.
                    boundary[1] = split
                    # And if there is a following boundary, make it start
                    # at the split as well so that there's no gap.
                    if previous_boundary is not None:
                        previous_boundary[0] = split
                    idx_splits -= 1
                elif boundary[0] > split - tolerance:
                    # This boundary is fully after the split, move on to
                    # the boundary before it.
                    idx_boundaries -= 1
                    previous_boundary = boundary
                else:
                    # The split is inside this boundary: cut it in two.
                    new_boundary = [split, boundary[1]]
                    boundaries.insert(idx_boundaries + 1, new_boundary)
                    boundary[1] = split
                    previous_boundary = new_boundary
                    idx_splits -= 1
        return boundaries

    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """Reconcile an overlapping pair of lattice and network bboxes.

        When network found no column boundaries, the lattice bbox is
        registered with the lattice parser. Otherwise the network column
        boundaries are augmented with the lattice column anchors, and the
        network parse is re-registered under the resulting (possibly
        wider) bbox, with the network parser in charge of it.
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if network_cols_boundaries is None:
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
        else:
            # The list is updated in place, so the "cols_boundaries" entry
            # of the network parse reflects the augmented boundaries too.
            network_cols_boundaries = self._augment_boundaries_with_splits(
                network_cols_boundaries, lattice_cols)  # self.column_tol???
            augmented_bbox = (
                network_cols_boundaries[0][0], network_bbox[1],
                network_cols_boundaries[-1][1], network_bbox[3],
            )
            network_bbox_data["cols_anchors"] = \
                boundaries_to_split_lines(network_cols_boundaries)

            # Re-key the network parse under the augmented bbox.
            del self.network_parser.table_bbox_parses[network_bbox]
            self.network_parser.table_bbox_parses[augmented_bbox] = \
                network_bbox_data
            self.table_bbox_parses[augmented_bbox] = self.network_parser

    def _generate_table_bbox(self):
        """Detect tables with both sub-parsers and merge their bboxes.

        Fills ``self.table_bbox_parses``, mapping each table bbox to the
        sub-parser that will generate the table.
        """
        # Start from a clean slate, as the network parser does, so that no
        # entry from a previously parsed page carries over.
        self.table_bbox_parses = {}

        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
            self.lattice_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
            self.network_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))

        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False
            # Walk backwards so that entries can be deleted while looping.
            for idx in range(len(_network_bboxes) - 1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser

        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser

    def record_parse_metadata(self, table):
        """Record data about the origin of the table (base behavior only)."""
        super().record_parse_metadata(table)

View File

@ -2,8 +2,6 @@
from __future__ import division
import os
import copy
from .base import BaseParser
from ..utils import (
@ -173,7 +171,6 @@ class Lattice(BaseParser):
super().record_parse_metadata(table)
# for plotting
table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments)
def _generate_table_bbox(self):
@ -193,7 +190,7 @@ class Lattice(BaseParser):
os.path.basename(self.filename),
".png"
)
export_pdf_as_png(self.filename, self.image_path)
export_pdf_as_png(self.filename, self.image_path, self.resolution)
self.pdf_image, self.threshold = adaptive_threshold(
self.image_path,
process_background=self.process_background,
@ -250,17 +247,59 @@ class Lattice(BaseParser):
areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
[
self.table_bbox,
self.table_bbox_parses,
self.vertical_segments,
self.horizontal_segments
] = scale_image(
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
)
def _generate_columns_and_rows(self, bbox, table_idx):
for bbox, parse in self.table_bbox_parses.items():
joints = parse["joints"]
# Merge x coordinates that are close together
line_tol = self.line_tol
# Sort the joints, make them a list of lists (instead of sets)
joints_normalized = list(
map(
lambda x: list(x),
sorted(joints, key=lambda j: - j[0])
)
)
for idx in range(1, len(joints_normalized)):
x_left, x_right = \
joints_normalized[idx-1][0], joints_normalized[idx][0]
if x_left - line_tol <= x_right <= x_left + line_tol:
joints_normalized[idx][0] = x_left
# Merge y coordinates that are close together
joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
for idx in range(1, len(joints_normalized)):
y_bottom, y_top = \
joints_normalized[idx-1][1], joints_normalized[idx][1]
if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
joints_normalized[idx][1] = y_bottom
# FRHTODO: check this is useful, otherwise get rid of the code
# above
parse["joints_normalized"] = joints_normalized
cols = list(map(lambda coords: coords[0], joints))
cols.extend([bbox[0], bbox[2]])
rows = list(map(lambda coords: coords[1], joints))
rows.extend([bbox[1], bbox[3]])
# sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True),
line_tol=self.line_tol
)
parse["col_anchors"] = cols
parse["row_anchors"] = rows
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox
v_s, h_s = segments_in_bbox(
bbox, self.vertical_segments, self.horizontal_segments
@ -270,21 +309,17 @@ class Lattice(BaseParser):
self.horizontal_text,
self.vertical_text
)
parse = self.table_bbox_parses[bbox]
cols, rows = zip(*self.table_bbox[bbox])
cols, rows = list(cols), list(rows)
cols.extend([bbox[0], bbox[2]])
rows.extend([bbox[1], bbox[3]])
# sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(
sorted(rows, reverse=True),
line_tol=self.line_tol
)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
cols = [
(parse["col_anchors"][i], parse["col_anchors"][i + 1])
for i in range(0, len(parse["col_anchors"]) - 1)
]
rows = [
(parse["row_anchors"][i], parse["row_anchors"][i + 1])
for i in range(0, len(parse["row_anchors"]) - 1)
]
return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs):

View File

@ -19,7 +19,8 @@ from ..utils import (
text_in_bbox,
textlines_overlapping_bbox,
bbox_from_textlines,
find_columns_coordinates,
find_columns_boundaries,
boundaries_to_split_lines,
text_in_bbox_per_axis,
)
@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl]
last_bbox = None
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
while last_bbox != bbox:
if parse_details_search is not None:
# Store debug info
@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
# of the new row won't reduce the number of columns.
# This happens when text covers multiple rows - that's only
# allowed in the header, treated separately.
cols_cand = find_columns_coordinates(tls_in_new_box)
cols_bounds = find_columns_boundaries(tls_in_new_box)
if direction in ["bottom", "top"] and \
len(cols_cand) < len(last_cols_cand):
len(cols_bounds) < len(last_cols_bounds):
continue
# We have an expansion candidate: register it, update the
@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
# We use bbox_from_textlines instead of cand_bbox in case some
# overlapping textlines require a large bbox for strict fit.
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
last_cols_cand = cols_cand
last_cols_bounds = cols_bounds
tls_in_bbox.extend(new_tls)
for i in range(len(tls_search_space) - 1, -1, -1):
textline = tls_search_space[i]
@ -591,7 +592,7 @@ class Network(TextBaseParser):
textlines = self._apply_regions_filter(all_textlines)
textlines_processed = {}
self.table_bbox = {}
self.table_bbox_parses = {}
if self.parse_details is not None:
parse_details_network_searches = []
self.parse_details["network_searches"] = \
@ -641,7 +642,8 @@ class Network(TextBaseParser):
# Get all the textlines that overlap with the box, compute
# columns
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
cols_anchors = find_columns_coordinates(tls_in_bbox)
cols_boundaries = find_columns_boundaries(tls_in_bbox)
cols_anchors = boundaries_to_split_lines(cols_boundaries)
# Unless the user gave us strict bbox_body, try to find a header
# above the body to build the full bbox.
@ -662,10 +664,11 @@ class Network(TextBaseParser):
table_parse = {
"bbox_body": bbox_body,
"cols_boundaries": cols_boundaries,
"cols_anchors": cols_anchors,
"bbox_full": bbox_full
}
self.table_bbox[bbox_full] = table_parse
self.table_bbox_parses[bbox_full] = table_parse
if self.parse_details is not None:
self.parse_details["col_searches"].append(table_parse)
@ -678,7 +681,7 @@ class Network(TextBaseParser):
textlines
))
def _generate_columns_and_rows(self, bbox, table_idx):
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis(
bbox,
@ -706,18 +709,14 @@ class Network(TextBaseParser):
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
if user_cols is not None:
cols = [text_x_min] + user_cols + [text_x_max]
cols = [
(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)
]
else:
parse_details = self.table_bbox[bbox]
parse_details = self.table_bbox_parses[bbox]
col_anchors = parse_details["cols_anchors"]
cols = list(map(
lambda idx: [col_anchors[idx], col_anchors[idx + 1]],

View File

@ -122,14 +122,14 @@ class Stream(TextBaseParser):
self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
table_bbox_parses = self._nurminen_table_detection(hor_text)
else:
table_bbox = {}
table_bbox_parses = {}
for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox
table_bbox_parses[bbox_from_str(area_str)] = None
self.table_bbox_parses = table_bbox_parses
def _generate_columns_and_rows(self, bbox, table_idx):
def _generate_columns_and_rows(self, bbox, user_cols):
# select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis(
bbox,
@ -140,26 +140,18 @@ class Stream(TextBaseParser):
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
)
# FRHTODO:
# This algorithm takes the horizontal textlines in the bbox, and groups
# them into rows based on their bottom y0.
# That's wrong: it misses the vertical items, and misses out on all
# the alignment identification work we've done earlier.
rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
if user_cols is not None:
cols = [text_x_min] + user_cols + [text_x_max]
cols = [
(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)
]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
@ -175,8 +167,8 @@ class Stream(TextBaseParser):
ncols = max(set(elements), key=elements.count)
else:
warnings.warn(
"No tables found in table area {}"
.format(table_idx + 1)
"No tables found in table area {bbox}".format(
bbox=bbox)
)
cols = [
(t.x0, t.x1)

View File

@ -74,7 +74,7 @@ def draw_labeled_bbox(
)
def draw_pdf(table, ax, to_pdf_scale=True):
def draw_pdf(table, ax):
"""Draw the content of the table's source pdf into the passed subplot
Parameters
@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
ax : matplotlib.axes.Axes (optional)
to_pdf_scale : bool (optional)
"""
img = table.get_pdf_image()
if to_pdf_scale:
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
else:
ax.imshow(img)
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
def draw_parse_constraints(table, ax):
@ -132,8 +127,6 @@ def draw_text(table, ax):
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes
"""
bbox = bbox_from_textlines(table.textlines)
for t in table.textlines:
@ -150,18 +143,14 @@ def draw_text(table, ax):
extend_axe_lim(ax, bbox)
def prepare_plot(table, ax=None, to_pdf_scale=True):
def prepare_plot(table, ax=None):
"""Initialize plot and draw common components
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
to_pdf_scale :
ax : matplotlib.axes.Axes
to_pdf_scale : bool (optional)
Returns
-------
@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
if ax is None:
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax, to_pdf_scale)
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
return ax
@ -186,7 +175,8 @@ class PlotMethods():
table: camelot.core.Table
A Camelot Table.
kind : str, optional (default: 'text')
{'text', 'grid', 'contour', 'joint', 'line'}
{'text', 'grid', 'contour', 'joint', 'line',
'network_table_search'}
The element type for which a plot should be generated.
filepath: str, optional (default: None)
Absolute path for saving the generated plot.
@ -203,9 +193,12 @@ class PlotMethods():
raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind)
)
if table.flavor in ["stream", "network"] and kind in ["line"]:
if table.flavor != "lattice" and kind in ["line"]:
raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind)
"{flavor} flavor does not support kind='{kind}'".format(
flavor=table.flavor,
kind=kind
)
)
plot_method = getattr(self, kind)
@ -274,25 +267,21 @@ class PlotMethods():
"""
_FOR_LATTICE = table.flavor == "lattice"
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
if _FOR_LATTICE:
table_bbox = table._bbox_unscaled
else:
table_bbox = {table._bbox: None}
ax = prepare_plot(table, ax)
if not _FOR_LATTICE:
draw_text(table, ax)
for t in table_bbox.keys():
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
fill=False, color="red"
)
ax.add_patch(
patches.Rectangle(
(table._bbox[0], table._bbox[1]),
table._bbox[2] - table._bbox[0],
table._bbox[3] - table._bbox[1],
fill=False, color="red"
)
if not _FOR_LATTICE:
extend_axe_lim(ax, t)
)
if not _FOR_LATTICE:
extend_axe_lim(ax, table._bbox)
return ax.get_figure()
@ -393,14 +382,12 @@ class PlotMethods():
fig : matplotlib.fig.Figure
"""
ax = prepare_plot(table, ax, to_pdf_scale=False)
table_bbox = table._bbox_unscaled
ax = prepare_plot(table, ax)
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
for coord in table.parse["joints"]:
x_coord.append(coord[0])
y_coord.append(coord[1])
ax.plot(x_coord, y_coord, "ro")
return ax.get_figure()

View File

@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
j_x, j_y = zip(*tables[k])
j_x = [scale(j, scaling_factor_x) for j in j_x]
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
joints = zip(j_x, j_y)
tables_new[(x1, y1, x2, y2)] = joints
tables_new[(x1, y1, x2, y2)] = {
"joints": list(zip(j_x, j_y))
}
v_segments_new = []
for v in v_segments:
@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
)
def bboxes_overlap(bbox1, bbox2):
    """Check whether two bounding boxes overlap.

    Parameters
    ----------
    bbox1 : tuple
        (left, bottom, right, top) coordinates of the first box.
    bbox2 : tuple
        (left, bottom, right, top) coordinates of the second box.

    Returns
    -------
    overlap : bool
        True when both the x-ranges and the y-ranges of the two boxes
        intersect. Boxes that merely touch along an edge or at a corner
        are not considered overlapping.

    Notes
    -----
    Assumes left <= right and bottom <= top for both boxes.

    """
    (left1, bottom1, right1, top1) = bbox1
    (left2, bottom2, right2, top2) = bbox2
    # Two ranges intersect when each one starts before the other one ends.
    # Only testing whether an edge of bbox2 falls inside bbox1 would miss
    # the cases where bbox2 contains bbox1, where both boxes are identical,
    # or where they cross each other.
    return (
        left1 < right2 and left2 < right1
        and bottom1 < top2 and bottom2 < top1
    )
def textlines_overlapping_bbox(bbox, textlines):
"""Returns all text objects which overlap or are within a bounding box.
@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
List of PDFMiner text objects.
"""
(left, bottom, right, top) = bbox
t_bbox = [
t
for t in textlines
if ((left < t.x0 < right) or (left < t.x1 < right))
and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
]
return t_bbox
@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
return bbox
def find_columns_coordinates(tls, min_gap=1.0):
"""Given a list of text objects, guess columns boundaries and returns a
list of x-coordinates for split points between columns.
def find_columns_boundaries(tls, min_gap=1.0):
"""Make a list of disjunct cols boundaries for a list of text objects
Parameters
----------
tls : list of PDFMiner text object.
min_gap : minimum distance between columns. Any elements closer than this
threshold are merged together. This is to prevent spaces between words
to be misinterpreted as column boundaries.
min_gap : minimum distance between columns. Any elements closer than
this threshold are merged together. This is to prevent spaces between
words to be misinterpreted as boundaries.
Returns
-------
cols_anchors : list
List of x-coordinates for columns.
boundaries : list
List x-coordinates for cols.
[(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
"""
# Make a list of disjunct cols boundaries across the textlines
# that comprise the table.
# [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
cols_bounds = []
tls.sort(key=lambda tl: tl.x0)
for tl in tls:
@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
cols_bounds.append([tl.x0, tl.x1])
else:
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
return cols_bounds
def find_rows_boundaries(tls, min_gap=1.0):
    """Group text objects into disjoint vertical ranges, one per row.

    Parameters
    ----------
    tls : list of PDFMiner text object.
        Sorted in place by increasing ``y0`` as a side effect.
    min_gap : minimum distance between rows. A text object starting no
        further than this distance above the current row is merged into
        that row.

    Returns
    -------
    boundaries : list
        List of y-coordinates for rows, by increasing bottom coordinate.
        [[1st row bottom, 1st row top], [2nd row bottom, 2nd row top], ...]

    """
    tls.sort(key=lambda tl: tl.y0)
    bounds = []
    for textline in tls:
        starts_new_row = (
            not bounds or bounds[-1][1] + min_gap < textline.y0
        )
        if starts_new_row:
            bounds.append([textline.y0, textline.y1])
            continue
        # Same row: extend its top if this text object reaches higher.
        current = bounds[-1]
        if textline.y1 > current[1]:
            current[1] = textline.y1
    return bounds
def boundaries_to_split_lines(boundaries):
    """Find split lines given a list of boundaries between rows or cols.

    Boundaries:  [ a ]     [b]    [  c  ]  [d]
    Splits:      |      |      |         |   |

    The first split is the start of the first boundary, the last split is
    the end of the last boundary, and every split in between sits half way
    between the end of one boundary and the start of the next.

    Parameters
    ----------
    boundaries : list
        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
        These are the (left, right most) or (bottom, top most) coordinates.

    Returns
    -------
    anchors : list
        List of coordinates representing the split points, each half way
        between boundaries. Empty when ``boundaries`` is empty.

    """
    # Nothing to split: avoid indexing into an empty list below.
    if not boundaries:
        return []
    anchors = [boundaries[0][0]]
    # Mid points of the gaps between consecutive boundaries.
    anchors.extend(
        (previous[1] + following[0]) / 2.0
        for previous, following in zip(boundaries, boundaries[1:])
    )
    anchors.append(boundaries[-1][1])
    return anchors
def get_index_closest_point(point, sorted_list, fn=lambda x: x):
@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
return t
def export_pdf_as_png(pdf_path, destination_path):
def export_pdf_as_png(pdf_path, destination_path, resolution=300):
"""Generate an image from a pdf.
Parameters
----------
pdf_path : str
destination_path : str
resolution : int
"""
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
gs_call = "-q -sDEVICE=png16m -o " \
"{destination_path} -r{resolution} {pdf_path}" \
.format(
destination_path=destination_path,
resolution=resolution,
pdf_path=pdf_path
)
gs_call = gs_call.encode().split()

File diff suppressed because one or more lines are too long

View File

@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
],
]
# Compared to network, hybrid detects additional sparse columns
# Expected cells for the hybrid parse: 17 rows of 27 string cells each.
# Every row is laid out on three lines: cells 0-8, cells 9-19, cells 20-26.
data_hybrid_vertical_headers = [
    # Header rows
    [
        "", "", "", "", "", "", "", "", "",
        "", "Congress-", "", "", "Senator 36th", "", "Rep106th", "", "Reg. of", "", "Road",
        "", "", "Distri", "Dist", "", "", "Dist",
    ],
    [
        "", "", "", "", "", "", "", "", "",
        "", "", "1st Dist", "Dist.", "", "", "Dist.", "Deeds", "", "Commission", "",
        "District #1", "", "ct #2", "#3", "Dist #4", "", "#5",
    ],
    [
        "", "", "", "", "", "Governor", "", "", "U.S. Senator",
        "", "", "", "", "", "", "", "", "", "", "",
        "", "", "", "", "", "", "",
    ],
    [
        "", "Number of Registered voters", "Poll Book Totals", "Brian Calley", "Patrick Colbeck", "Jim Hines", "Bill Schuette", "John James", "Sandy Pensler",
        "", "Jack Bergman", "", "Jim Stamas", "", "Sue Allor", "", "Melissa A. Cordes", "", "Al Scully", "",
        "Daniel G. Gauthier", "Craig M. Clemens", "Craig Johnston", "Carolyn Brummund", "Adam Brege", "David Bielusiak", "",
    ],
    # Data rows
    [
        "Alcona", "963", "439", "55", "26", "47", "164", "173", "111",
        "", "268", "", "272", "", "275", "", "269", "", "271", "",
        "224", "76", "", "", "", "", "",
    ],
    [
        "Caledonia", "923", "393", "40", "23", "45", "158", "150", "103",
        "", "244", "", "247", "", "254", "", "255", "", "244", "",
        "139", "143", "", "", "", "", "",
    ],
    [
        "Curtis", "1026", "349", "30", "30", "25", "102", "95", "84",
        "", "159", "", "164", "", "162", "", "161", "", "157", "",
        "", "", "", "", "", "", "",
    ],
    [
        "Greenbush", "1212", "423", "56", "26", "40", "126", "104", "131",
        "", "208", "", "213", "", "214", "", "215", "", "208", "",
        "", "", "", "208", "", "", "",
    ],
    [
        "Gustin", "611", "180", "22", "35", "17", "55", "73", "45",
        "", "108", "", "104", "", "111", "", "111", "", "109", "",
        "", "", "", "", "81", "42", "",
    ],
    [
        "Harrisville", "1142", "430", "45", "90", "29", "101", "155", "94",
        "", "226", "", "226", "", "232", "", "244", "", "226", "",
        "", "", "232", "", "", "", "",
    ],
    [
        "Hawes", "884", "293", "38", "36", "27", "109", "121", "84",
        "", "192", "", "195", "", "195", "", "193", "", "184", "",
        "", "", "", "", "118", "87", "",
    ],
    [
        "Haynes", "626", "275", "31", "20", "32", "104", "121", "53",
        "", "163", "", "163", "", "173", "", "161", "", "152", "",
        "", "", "76", "", "69", "31", "",
    ],
    [
        "Mikado", "781", "208", "19", "39", "17", "81", "90", "63",
        "", "149", "", "149", "", "145", "", "147", "", "143", "",
        "", "", "", "113", "", "", "",
    ],
    [
        "Millen", "353", "139", "7", "16", "13", "38", "49", "19",
        "", "62", "", "66", "", "67", "", "66", "", "62", "",
        "", "", "", "", "", "", "",
    ],
    [
        "Mitchell", "327", "96", "12", "17", "7", "29", "41", "17",
        "", "57", "", "55", "", "57", "", "60", "", "56", "",
        "", "", "", "", "", "", "",
    ],
    [
        "City Harrisville", "389", "171", "16", "15", "18", "35", "49", "31",
        "", "78", "", "80", "", "82", "", "81", "", "77", "",
        "", "", "73", "", "", "", "",
    ],
    # Totals row
    [
        "Totals", "9237", "3396", "371", "373", "317", "1102", "1221", "835",
        "0", "1914", "0", "1934", "", "1967", "", "1963", "0", "1889", "0",
        "363", "219", "381", "321", "268", "160", "0",
    ],
]
data_stream_table_areas = [

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

After

Width:  |  Height:  |  Size: 46 KiB

View File

@ -291,6 +291,19 @@ def test_network_layout_kwargs():
assert_frame_equal(df, tables[0].df)
# Hybrid parser
def test_hybrid_vertical_header():
    """Test the hybrid parser on a complex table with a vertical text header.

    Parses ``vertical_header.pdf`` with ``flavor="hybrid"`` and expects
    exactly one table, equal to ``data_hybrid_vertical_headers``.
    """
    pdf_path = os.path.join(testdir, "vertical_header.pdf")
    parsed_tables = camelot.read_pdf(pdf_path, flavor="hybrid")
    assert len(parsed_tables) == 1

    expected_df = pd.DataFrame(data_hybrid_vertical_headers)
    assert_frame_equal(expected_df, parsed_tables[0].df)
# Lattice parser tests
def test_lattice():
df = pd.DataFrame(data_lattice)