WIP: Introduce actual hybrid parser
Create a hybrid parser that leverages both lattice and network techniques. Simplify plotting of the PDF in the lattice case. Rename "parser.table_bbox" to "parser.table_bbox_parses", since it represents not a bbox but a dict mapping each bbox to its corresponding parsing data. Still missing: more unit tests, plotting of steps.
pull/153/head
parent
edad1efd1b
commit
4a761611bf
|
|
@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
|
|||
"Please specify output file format using --format")
|
||||
|
||||
tables = read_pdf(
|
||||
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
|
||||
filepath, pages=pages, flavor="network",
|
||||
suppress_stdout=quiet, **kwargs
|
||||
)
|
||||
click.echo("Found {} tables".format(tables.n))
|
||||
if plot_type is not None:
|
||||
|
|
|
|||
|
|
@ -454,7 +454,9 @@ class Table():
|
|||
self.page = None
|
||||
self.flavor = None # Flavor of the parser used
|
||||
self.pdf_size = None # Dimensions of the original PDF page
|
||||
self.parse_details = None # Field holding debug data
|
||||
self._bbox = None # Bounding box in original document
|
||||
self.parse = None # Parse information
|
||||
self.parse_details = None # Field holding extra debug data
|
||||
|
||||
self._image = None
|
||||
self._image_path = None # Temporary file to hold an image of the pdf
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import logging
|
|||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
from .core import TableList
|
||||
from .parsers import Stream, Lattice, Network
|
||||
from .parsers import Stream, Lattice, Network, Hybrid
|
||||
from .utils import (
|
||||
build_file_path_in_temp_dir,
|
||||
get_page_layout,
|
||||
|
|
@ -23,6 +23,7 @@ PARSERS = {
|
|||
"lattice": Lattice,
|
||||
"stream": Stream,
|
||||
"network": Network,
|
||||
"hybrid": Hybrid,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -177,7 +178,8 @@ class PDFHandler():
|
|||
Parameters
|
||||
----------
|
||||
flavor : str (default: 'lattice')
|
||||
The parsing method to use ('lattice', 'stream', or 'network').
|
||||
The parsing method to use ('lattice', 'stream', 'network',
|
||||
or 'hybrid').
|
||||
Lattice is used by default.
|
||||
suppress_stdout : str (default: False)
|
||||
Suppress logs and warnings.
|
||||
|
|
|
|||
|
|
@ -6,7 +6,9 @@ import cv2
|
|||
import numpy as np
|
||||
|
||||
|
||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||
def adaptive_threshold(
|
||||
imagename, process_background=False,
|
||||
blocksize=15, c=-2):
|
||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||
|
||||
Parameters
|
||||
|
|
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
Size of a pixel neighborhood that is used to calculate a
|
||||
threshold value for the pixel: 3, 5, 7, and so on.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
|
||||
c : int, optional (default: -2)
|
||||
Constant subtracted from the mean or weighted mean.
|
||||
Normally, it is positive but may be zero or negative as well.
|
||||
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
|
||||
For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
|
||||
if process_background:
|
||||
threshold = cv2.adaptiveThreshold(
|
||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
||||
gray,
|
||||
255,
|
||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY, blocksize, c
|
||||
)
|
||||
else:
|
||||
threshold = cv2.adaptiveThreshold(
|
||||
|
|
@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
|||
|
||||
|
||||
def find_lines(
|
||||
threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
|
||||
threshold, regions=None,
|
||||
direction="horizontal", line_scale=15, iterations=0
|
||||
):
|
||||
"""Finds horizontal and vertical lines by applying morphological
|
||||
transformations on an image.
|
||||
|
|
@ -78,7 +84,7 @@ def find_lines(
|
|||
iterations : int, optional (default: 0)
|
||||
Number of times for erosion/dilation is applied.
|
||||
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
|
||||
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -100,13 +106,15 @@ def find_lines(
|
|||
size = threshold.shape[1] // line_scale
|
||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||
elif direction is None:
|
||||
raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
|
||||
raise ValueError(
|
||||
"Specify direction as either 'vertical' or 'horizontal'"
|
||||
)
|
||||
|
||||
if regions is not None:
|
||||
region_mask = np.zeros(threshold.shape)
|
||||
for region in regions:
|
||||
x, y, w, h = region
|
||||
region_mask[y : y + h, x : x + w] = 1
|
||||
region_mask[y:y + h, x:x + w] = 1
|
||||
threshold = np.multiply(threshold, region_mask)
|
||||
|
||||
threshold = cv2.erode(threshold, el)
|
||||
|
|
@ -115,12 +123,14 @@ def find_lines(
|
|||
|
||||
try:
|
||||
_, contours, _ = cv2.findContours(
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
|
||||
cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
except ValueError:
|
||||
# for opencv backward compatibility
|
||||
contours, _ = cv2.findContours(
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
|
||||
cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
|
||||
for c in contours:
|
||||
|
|
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
|
|||
tables = {}
|
||||
for c in contours:
|
||||
x, y, w, h = c
|
||||
roi = joints[y : y + h, x : x + w]
|
||||
roi = joints[y:y + h, x:x + w]
|
||||
try:
|
||||
__, jc, __ = cv2.findContours(
|
||||
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ def read_pdf(
|
|||
|
||||
"""
|
||||
layout_kwargs = layout_kwargs or {}
|
||||
if flavor not in ["lattice", "stream", "network"]:
|
||||
if flavor not in ["lattice", "stream", "network", "hybrid"]:
|
||||
raise NotImplementedError(
|
||||
"Unknown flavor specified."
|
||||
" Use either 'lattice', 'stream', or 'network'"
|
||||
|
|
|
|||
|
|
@ -3,3 +3,4 @@
|
|||
from .stream import Stream
|
||||
from .lattice import Lattice
|
||||
from .network import Network
|
||||
from .hybrid import Hybrid
|
||||
|
|
|
|||
|
|
@ -34,8 +34,9 @@ class BaseParser():
|
|||
self.id = parser_id
|
||||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
self.table_bbox = {}
|
||||
self.table_bbox_parses = {}
|
||||
|
||||
self.columns = None
|
||||
self.copy_text = copy_text
|
||||
self.split_text = split_text
|
||||
self.strip_text = strip_text
|
||||
|
|
@ -47,10 +48,18 @@ class BaseParser():
|
|||
self.t_bbox = None
|
||||
|
||||
# For plotting details of parsing algorithms
|
||||
self.resolution = 300 # default plotting resolution of the PDF.
|
||||
self.parse_details = {}
|
||||
if not debug:
|
||||
self.parse_details = None
|
||||
|
||||
def table_bboxes(self):
    """Return the table bounding boxes known to this parser, in order.

    The bounding boxes are the keys of ``self.table_bbox_parses``. They are
    ordered by their second component (the y-coordinate), largest value
    first.

    Returns
    -------
    list
        The sorted bounding boxes.

    """
    ordered_bboxes = list(self.table_bbox_parses)
    ordered_bboxes.sort(key=lambda bbox: bbox[1], reverse=True)
    return ordered_bboxes
|
||||
|
||||
def prepare_page_parse(self, filename, layout, dimensions,
|
||||
page_idx, layout_kwargs):
|
||||
self.filename = filename
|
||||
|
|
@ -142,6 +151,7 @@ class BaseParser():
|
|||
table = Table(cols, rows)
|
||||
table.page = self.page
|
||||
table.order = table_idx + 1
|
||||
table._bbox = self.table_bboxes()[table_idx]
|
||||
return table
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -177,7 +187,7 @@ class BaseParser():
|
|||
table.cells[r_idx][c_idx].text = text
|
||||
return pos_errors
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||
def _generate_columns_and_rows(self, bbox, user_cols):
|
||||
# Pure virtual, must be defined by the derived parser
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
@ -199,20 +209,23 @@ class BaseParser():
|
|||
|
||||
_tables = []
|
||||
# sort tables based on y-coord
|
||||
for table_idx, bbox in enumerate(
|
||||
sorted(
|
||||
self.table_bbox.keys(),
|
||||
key=lambda x: x[1],
|
||||
reverse=True
|
||||
)
|
||||
):
|
||||
for table_idx, bbox in enumerate(self.table_bboxes()):
|
||||
if self.columns is not None and self.columns[table_idx] != "":
|
||||
# user has to input boundary columns too
|
||||
# take (0, pdf_width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
user_cols = self.columns[table_idx].split(",")
|
||||
user_cols = [float(c) for c in user_cols]
|
||||
else:
|
||||
user_cols = None
|
||||
|
||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||
bbox,
|
||||
table_idx
|
||||
user_cols
|
||||
)
|
||||
table = self._generate_table(
|
||||
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||
table._bbox = bbox
|
||||
_tables.append(table)
|
||||
|
||||
return _tables
|
||||
|
|
@ -222,6 +235,7 @@ class BaseParser():
|
|||
"""
|
||||
table.flavor = self.id
|
||||
table.filename = self.filename
|
||||
table.parse = self.table_bbox_parses[table._bbox]
|
||||
table.parse_details = self.parse_details
|
||||
pos_errors = self.compute_parse_errors(table)
|
||||
table.accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
|
@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
|
|||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
|
||||
def record_parse_metadata(self, table):
|
||||
"""Record data about the origin of the table
|
||||
"""
|
||||
super().record_parse_metadata(table)
|
||||
# for plotting
|
||||
table._bbox = self.table_bbox
|
||||
table._segments = None
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
table = self._initialize_new_table(table_idx, cols, rows)
|
||||
table = table.set_all_edges()
|
||||
self.record_parse_metadata(table)
|
||||
|
||||
return table
|
||||
|
||||
def record_parse_metadata(self, table):
|
||||
"""Record data about the origin of the table
|
||||
"""
|
||||
super().record_parse_metadata(table)
|
||||
# for plotting
|
||||
table._segments = None
|
||||
|
|
|
|||
|
|
@ -0,0 +1,221 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from ..utils import (
|
||||
bboxes_overlap,
|
||||
boundaries_to_split_lines,
|
||||
)
|
||||
|
||||
from .base import BaseParser
|
||||
from .network import Network
|
||||
from .lattice import Lattice
|
||||
|
||||
|
||||
class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.

    Both underlying parsers are run on each page. Their table bounding boxes
    are then reconciled: where a lattice bbox and a network bbox overlap, the
    network column boundaries are completed with the lattice column anchors.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated. Only forwarded to the network parser.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
        Forwarded unchanged to both underlying parsers.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    debug : bool, optional (default: False)
        Forwarded to the base class and to both underlying parsers.
    **kwargs
        Accepted but not used by this class.

    """

    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            debug=debug,
        )
        # NOTE(review): `columns` is only handed to the network parser; it is
        # not stored on the hybrid parser itself. Confirm whether user
        # columns are meant to be honoured at the hybrid level as well.
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
        self.lattice_parser = Lattice(
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )

    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        """Prepare this parser and both underlying parsers for a page.

        The same arguments are passed to the base class implementation, to
        the network parser and to the lattice parser.
        """
        super().prepare_page_parse(filename, layout, dimensions,
                                   page_idx, layout_kwargs)
        self.network_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)
        self.lattice_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)

    def _generate_columns_and_rows(self, bbox, user_cols):
        """Delegate columns/rows generation to the parser owning ``bbox``.

        ``self.table_bbox_parses`` maps each bbox to the underlying parser
        (network or lattice) that was selected for it.
        """
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, user_cols)

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Delegate table generation to the parser owning the table's bbox.

        ``table_idx`` indexes the hybrid parser's ordered bboxes. The
        underlying parser keeps its own set of bboxes, so the index is
        translated into that parser's ordering before delegating, and the
        hybrid-level order is restored on the resulting table.
        """
        bbox = self.table_bboxes()[table_idx]
        parser = self.table_bbox_parses[bbox]
        # The sub-parser resolves the bbox from the index it is given, using
        # its own table_bboxes(); passing the hybrid index would select the
        # wrong bbox (or fail) whenever the two orderings differ.
        parser_table_idx = parser.table_bboxes().index(bbox)
        table = parser._generate_table(parser_table_idx, cols, rows, **kwargs)
        table.order = table_idx + 1
        return table

    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """Augment existing boundaries using provided hard splits.

        ``boundaries`` is a list of mutable ``[left, right]`` pairs and
        ``splits`` a sequence of coordinates, both in ascending order. Both
        are walked from the right-most element to the left-most one:

        - a split to the right of a boundary extends that boundary up to the
          split (and pulls the left edge of the next boundary onto it, so
          that no gap remains);
        - a split strictly inside a boundary cuts it in two;
        - splits left of every boundary add new leading boundaries.

        Example: boundaries ``[[1, 3], [5, 6], [8, 12]]`` with splits
        ``[0, 4, 7, 13]`` become ``[[0, 1], [1, 4], [4, 7], [7, 13]]``.

        A split that coincides with a boundary edge (within ``tolerance``)
        is treated as lying outside of the boundary, so that no zero-width
        segment is produced.

        Parameters
        ----------
        boundaries : list
            List of ``[left, right]`` lists. Modified in place.
        splits : sequence
            Split coordinates.
        tolerance : number, optional (default: 0)
            Slack used when comparing a split to a boundary edge.

        Returns
        -------
        list
            The same ``boundaries`` list, after modification.

        """
        if not boundaries:
            # Nothing to adjust: the segments are the intervals between
            # consecutive splits (none if there are fewer than two splits).
            boundaries.extend(
                [left, right] for left, right in zip(splits, splits[1:])
            )
            return boundaries

        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        # Boundary immediately to the right of the one being examined.
        right_neighbor = None
        while idx_splits >= 0:
            split = splits[idx_splits]

            if idx_boundaries < 0:
                # All boundaries consumed: each remaining split opens a new
                # segment ending where the current left-most one starts.
                boundaries.insert(0, [split, boundaries[0][0]])
                idx_splits -= 1
                continue

            boundary = boundaries[idx_boundaries]
            if boundary[1] <= split + tolerance:
                # The split is at or beyond the right edge of this
                # boundary: move the right edge onto the split.
                boundary[1] = split
                # And if there is a segment after, make its left edge match
                # as well so that there's no gap.
                if right_neighbor is not None:
                    right_neighbor[0] = split
                idx_splits -= 1
            elif boundary[0] >= split - tolerance:
                # This boundary is fully after the split, move on to the
                # boundary on its left.
                idx_boundaries -= 1
                right_neighbor = boundary
            else:
                # The split is strictly inside this boundary: cut it in two.
                new_boundary = [split, boundary[1]]
                boundaries.insert(idx_boundaries + 1, new_boundary)
                boundary[1] = split
                right_neighbor = new_boundary
                idx_splits -= 1
        return boundaries

    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """Reconcile an overlapping lattice bbox and network bbox.

        When the network parse has no column boundaries, the lattice parser
        is registered for ``lattice_bbox``. Otherwise the network column
        boundaries are augmented with the lattice column anchors, the
        network parse is re-registered under the resulting (possibly wider)
        bbox, and the network parser is registered for that bbox.
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if not network_cols_boundaries:
            # No (or empty) network columns: nothing to augment.
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
            return

        # TODO: decide whether column_tol should be passed as tolerance.
        network_cols_boundaries = self._augment_boundaries_with_splits(
            network_cols_boundaries, lattice_cols)
        # Horizontal extent follows the augmented columns, vertical extent
        # is kept from the network bbox.
        augmented_bbox = (
            network_cols_boundaries[0][0], network_bbox[1],
            network_cols_boundaries[-1][1], network_bbox[3],
        )
        network_bbox_data["cols_anchors"] = \
            boundaries_to_split_lines(network_cols_boundaries)

        # Re-key the network parse under the augmented bbox so that the
        # network parser can look it up later.
        del self.network_parser.table_bbox_parses[network_bbox]
        self.network_parser.table_bbox_parses[augmented_bbox] = \
            network_bbox_data
        self.table_bbox_parses[augmented_bbox] = self.network_parser

    def _generate_table_bbox(self):
        """Detect table bboxes with both parsers and merge the results.

        Fills ``self.table_bbox_parses`` with a mapping from each retained
        bbox to the underlying parser responsible for it.
        """
        # Start from a clean state, like the network parser does, so that
        # bboxes from a previously processed page are not carried over.
        self.table_bbox_parses = {}

        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
            self.lattice_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
            self.network_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))

        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False

            # Walk backwards so that merged entries can be deleted safely.
            for idx in range(len(_network_bboxes) - 1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser

        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser

    def record_parse_metadata(self, table):
        """Record data about the origin of the table (base behavior only)."""
        super().record_parse_metadata(table)
|
||||
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
from __future__ import division
|
||||
import os
|
||||
import copy
|
||||
|
||||
|
||||
from .base import BaseParser
|
||||
from ..utils import (
|
||||
|
|
@ -173,7 +171,6 @@ class Lattice(BaseParser):
|
|||
super().record_parse_metadata(table)
|
||||
# for plotting
|
||||
table._image = self.pdf_image # Reuse the image used for calc
|
||||
table._bbox_unscaled = self.table_bbox_unscaled
|
||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
|
|
@ -193,7 +190,7 @@ class Lattice(BaseParser):
|
|||
os.path.basename(self.filename),
|
||||
".png"
|
||||
)
|
||||
export_pdf_as_png(self.filename, self.image_path)
|
||||
export_pdf_as_png(self.filename, self.image_path, self.resolution)
|
||||
self.pdf_image, self.threshold = adaptive_threshold(
|
||||
self.image_path,
|
||||
process_background=self.process_background,
|
||||
|
|
@ -250,17 +247,59 @@ class Lattice(BaseParser):
|
|||
areas = scale_areas(self.table_areas)
|
||||
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||
|
||||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||
|
||||
[
|
||||
self.table_bbox,
|
||||
self.table_bbox_parses,
|
||||
self.vertical_segments,
|
||||
self.horizontal_segments
|
||||
] = scale_image(
|
||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||
)
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||
for bbox, parse in self.table_bbox_parses.items():
|
||||
joints = parse["joints"]
|
||||
|
||||
# Merge x coordinates that are close together
|
||||
line_tol = self.line_tol
|
||||
# Sort the joints, make them a list of lists (instead of sets)
|
||||
joints_normalized = list(
|
||||
map(
|
||||
lambda x: list(x),
|
||||
sorted(joints, key=lambda j: - j[0])
|
||||
)
|
||||
)
|
||||
for idx in range(1, len(joints_normalized)):
|
||||
x_left, x_right = \
|
||||
joints_normalized[idx-1][0], joints_normalized[idx][0]
|
||||
if x_left - line_tol <= x_right <= x_left + line_tol:
|
||||
joints_normalized[idx][0] = x_left
|
||||
|
||||
# Merge y coordinates that are close together
|
||||
joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
|
||||
for idx in range(1, len(joints_normalized)):
|
||||
y_bottom, y_top = \
|
||||
joints_normalized[idx-1][1], joints_normalized[idx][1]
|
||||
if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
|
||||
joints_normalized[idx][1] = y_bottom
|
||||
|
||||
# FRHTODO: check this is useful, otherwise get rid of the code
|
||||
# above
|
||||
parse["joints_normalized"] = joints_normalized
|
||||
|
||||
cols = list(map(lambda coords: coords[0], joints))
|
||||
cols.extend([bbox[0], bbox[2]])
|
||||
rows = list(map(lambda coords: coords[1], joints))
|
||||
rows.extend([bbox[1], bbox[3]])
|
||||
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||
rows = merge_close_lines(
|
||||
sorted(rows, reverse=True),
|
||||
line_tol=self.line_tol
|
||||
)
|
||||
parse["col_anchors"] = cols
|
||||
parse["row_anchors"] = rows
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, user_cols):
|
||||
# select elements which lie within table_bbox
|
||||
v_s, h_s = segments_in_bbox(
|
||||
bbox, self.vertical_segments, self.horizontal_segments
|
||||
|
|
@ -270,21 +309,17 @@ class Lattice(BaseParser):
|
|||
self.horizontal_text,
|
||||
self.vertical_text
|
||||
)
|
||||
parse = self.table_bbox_parses[bbox]
|
||||
|
||||
cols, rows = zip(*self.table_bbox[bbox])
|
||||
cols, rows = list(cols), list(rows)
|
||||
cols.extend([bbox[0], bbox[2]])
|
||||
rows.extend([bbox[1], bbox[3]])
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||
rows = merge_close_lines(
|
||||
sorted(rows, reverse=True),
|
||||
line_tol=self.line_tol
|
||||
)
|
||||
# make grid using x and y coord of shortlisted rows and cols
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
||||
|
||||
cols = [
|
||||
(parse["col_anchors"][i], parse["col_anchors"][i + 1])
|
||||
for i in range(0, len(parse["col_anchors"]) - 1)
|
||||
]
|
||||
rows = [
|
||||
(parse["row_anchors"][i], parse["row_anchors"][i + 1])
|
||||
for i in range(0, len(parse["row_anchors"]) - 1)
|
||||
]
|
||||
return cols, rows, v_s, h_s
|
||||
|
||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
|
|
|
|||
|
|
@ -19,7 +19,8 @@ from ..utils import (
|
|||
text_in_bbox,
|
||||
textlines_overlapping_bbox,
|
||||
bbox_from_textlines,
|
||||
find_columns_coordinates,
|
||||
find_columns_boundaries,
|
||||
boundaries_to_split_lines,
|
||||
text_in_bbox_per_axis,
|
||||
)
|
||||
|
||||
|
|
@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
|
|||
tls_search_space.remove(most_aligned_tl)
|
||||
tls_in_bbox = [most_aligned_tl]
|
||||
last_bbox = None
|
||||
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
|
||||
last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
|
||||
while last_bbox != bbox:
|
||||
if parse_details_search is not None:
|
||||
# Store debug info
|
||||
|
|
@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
|
|||
# of the new row won't reduce the number of columns.
|
||||
# This happens when text covers multiple rows - that's only
|
||||
# allowed in the header, treated separately.
|
||||
cols_cand = find_columns_coordinates(tls_in_new_box)
|
||||
cols_bounds = find_columns_boundaries(tls_in_new_box)
|
||||
if direction in ["bottom", "top"] and \
|
||||
len(cols_cand) < len(last_cols_cand):
|
||||
len(cols_bounds) < len(last_cols_bounds):
|
||||
continue
|
||||
|
||||
# We have an expansion candidate: register it, update the
|
||||
|
|
@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
|
|||
# We use bbox_from_textlines instead of cand_bbox in case some
|
||||
# overlapping textlines require a large bbox for strict fit.
|
||||
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
|
||||
last_cols_cand = cols_cand
|
||||
last_cols_bounds = cols_bounds
|
||||
tls_in_bbox.extend(new_tls)
|
||||
for i in range(len(tls_search_space) - 1, -1, -1):
|
||||
textline = tls_search_space[i]
|
||||
|
|
@ -591,7 +592,7 @@ class Network(TextBaseParser):
|
|||
textlines = self._apply_regions_filter(all_textlines)
|
||||
|
||||
textlines_processed = {}
|
||||
self.table_bbox = {}
|
||||
self.table_bbox_parses = {}
|
||||
if self.parse_details is not None:
|
||||
parse_details_network_searches = []
|
||||
self.parse_details["network_searches"] = \
|
||||
|
|
@ -641,7 +642,8 @@ class Network(TextBaseParser):
|
|||
# Get all the textlines that overlap with the box, compute
|
||||
# columns
|
||||
tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
|
||||
cols_anchors = find_columns_coordinates(tls_in_bbox)
|
||||
cols_boundaries = find_columns_boundaries(tls_in_bbox)
|
||||
cols_anchors = boundaries_to_split_lines(cols_boundaries)
|
||||
|
||||
# Unless the user gave us strict bbox_body, try to find a header
|
||||
# above the body to build the full bbox.
|
||||
|
|
@ -662,10 +664,11 @@ class Network(TextBaseParser):
|
|||
|
||||
table_parse = {
|
||||
"bbox_body": bbox_body,
|
||||
"cols_boundaries": cols_boundaries,
|
||||
"cols_anchors": cols_anchors,
|
||||
"bbox_full": bbox_full
|
||||
}
|
||||
self.table_bbox[bbox_full] = table_parse
|
||||
self.table_bbox_parses[bbox_full] = table_parse
|
||||
|
||||
if self.parse_details is not None:
|
||||
self.parse_details["col_searches"].append(table_parse)
|
||||
|
|
@ -678,7 +681,7 @@ class Network(TextBaseParser):
|
|||
textlines
|
||||
))
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||
def _generate_columns_and_rows(self, bbox, user_cols):
|
||||
# select elements which lie within table_bbox
|
||||
self.t_bbox = text_in_bbox_per_axis(
|
||||
bbox,
|
||||
|
|
@ -706,18 +709,14 @@ class Network(TextBaseParser):
|
|||
rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
|
||||
if self.columns is not None and self.columns[table_idx] != "":
|
||||
# user has to input boundary columns too
|
||||
# take (0, pdf_width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_idx].split(",")
|
||||
cols = [float(c) for c in cols]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
if user_cols is not None:
|
||||
cols = [text_x_min] + user_cols + [text_x_max]
|
||||
cols = [
|
||||
(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)
|
||||
]
|
||||
else:
|
||||
parse_details = self.table_bbox[bbox]
|
||||
parse_details = self.table_bbox_parses[bbox]
|
||||
col_anchors = parse_details["cols_anchors"]
|
||||
cols = list(map(
|
||||
lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
|
||||
|
|
|
|||
|
|
@ -122,14 +122,14 @@ class Stream(TextBaseParser):
|
|||
self.horizontal_text)
|
||||
hor_text.extend(region_text)
|
||||
# find tables based on nurminen's detection algorithm
|
||||
table_bbox = self._nurminen_table_detection(hor_text)
|
||||
table_bbox_parses = self._nurminen_table_detection(hor_text)
|
||||
else:
|
||||
table_bbox = {}
|
||||
table_bbox_parses = {}
|
||||
for area_str in self.table_areas:
|
||||
table_bbox[bbox_from_str(area_str)] = None
|
||||
self.table_bbox = table_bbox
|
||||
table_bbox_parses[bbox_from_str(area_str)] = None
|
||||
self.table_bbox_parses = table_bbox_parses
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||
def _generate_columns_and_rows(self, bbox, user_cols):
|
||||
# select elements which lie within table_bbox
|
||||
self.t_bbox = text_in_bbox_per_axis(
|
||||
bbox,
|
||||
|
|
@ -140,26 +140,18 @@ class Stream(TextBaseParser):
|
|||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||
)
|
||||
# FRHTODO:
|
||||
# This algorithm takes the horizontal textlines in the bbox, and groups
|
||||
# them into rows based on their bottom y0.
|
||||
# That's wrong: it misses the vertical items, and misses out on all
|
||||
# the alignment identification work we've done earlier.
|
||||
|
||||
rows_grouped = self._group_rows(
|
||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
if self.columns is not None and self.columns[table_idx] != "":
|
||||
# user has to input boundary columns too
|
||||
# take (0, pdf_width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_idx].split(",")
|
||||
cols = [float(c) for c in cols]
|
||||
cols.insert(0, text_x_min)
|
||||
cols.append(text_x_max)
|
||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||
if user_cols is not None:
|
||||
cols = [text_x_min] + user_cols + [text_x_max]
|
||||
cols = [
|
||||
(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)
|
||||
]
|
||||
else:
|
||||
# calculate mode of the list of number of elements in
|
||||
# each row to guess the number of columns
|
||||
|
|
@ -175,8 +167,8 @@ class Stream(TextBaseParser):
|
|||
ncols = max(set(elements), key=elements.count)
|
||||
else:
|
||||
warnings.warn(
|
||||
"No tables found in table area {}"
|
||||
.format(table_idx + 1)
|
||||
"No tables found in table area {bbox}".format(
|
||||
bbox=bbox)
|
||||
)
|
||||
cols = [
|
||||
(t.x0, t.x1)
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ def draw_labeled_bbox(
|
|||
)
|
||||
|
||||
|
||||
def draw_pdf(table, ax, to_pdf_scale=True):
|
||||
def draw_pdf(table, ax):
|
||||
"""Draw the content of the table's source pdf into the passed subplot
|
||||
|
||||
Parameters
|
||||
|
|
@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
|
|||
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
to_pdf_scale : bool (optional)
|
||||
|
||||
"""
|
||||
img = table.get_pdf_image()
|
||||
if to_pdf_scale:
|
||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
else:
|
||||
ax.imshow(img)
|
||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
|
||||
|
||||
def draw_parse_constraints(table, ax):
|
||||
|
|
@ -132,8 +127,6 @@ def draw_text(table, ax):
|
|||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
|
||||
"""
|
||||
bbox = bbox_from_textlines(table.textlines)
|
||||
for t in table.textlines:
|
||||
|
|
@ -150,18 +143,14 @@ def draw_text(table, ax):
|
|||
extend_axe_lim(ax, bbox)
|
||||
|
||||
|
||||
def prepare_plot(table, ax=None, to_pdf_scale=True):
|
||||
def prepare_plot(table, ax=None):
|
||||
"""Initialize plot and draw common components
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
to_pdf_scale :
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
|
||||
to_pdf_scale : bool (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
|
|||
if ax is None:
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax, to_pdf_scale)
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
return ax
|
||||
|
||||
|
|
@ -186,7 +175,8 @@ class PlotMethods():
|
|||
table: camelot.core.Table
|
||||
A Camelot Table.
|
||||
kind : str, optional (default: 'text')
|
||||
{'text', 'grid', 'contour', 'joint', 'line'}
|
||||
{'text', 'grid', 'contour', 'joint', 'line',
|
||||
'network_table_search'}
|
||||
The element type for which a plot should be generated.
|
||||
filepath: str, optional (default: None)
|
||||
Absolute path for saving the generated plot.
|
||||
|
|
@ -203,9 +193,12 @@ class PlotMethods():
|
|||
raise NotImplementedError(
|
||||
"Lattice flavor does not support kind='{}'".format(kind)
|
||||
)
|
||||
if table.flavor in ["stream", "network"] and kind in ["line"]:
|
||||
if table.flavor != "lattice" and kind in ["line"]:
|
||||
raise NotImplementedError(
|
||||
"Stream flavor does not support kind='{}'".format(kind)
|
||||
"{flavor} flavor does not support kind='{kind}'".format(
|
||||
flavor=table.flavor,
|
||||
kind=kind
|
||||
)
|
||||
)
|
||||
|
||||
plot_method = getattr(self, kind)
|
||||
|
|
@ -274,25 +267,21 @@ class PlotMethods():
|
|||
|
||||
"""
|
||||
_FOR_LATTICE = table.flavor == "lattice"
|
||||
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
|
||||
|
||||
if _FOR_LATTICE:
|
||||
table_bbox = table._bbox_unscaled
|
||||
else:
|
||||
table_bbox = {table._bbox: None}
|
||||
ax = prepare_plot(table, ax)
|
||||
|
||||
if not _FOR_LATTICE:
|
||||
draw_text(table, ax)
|
||||
|
||||
for t in table_bbox.keys():
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||
fill=False, color="red"
|
||||
)
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(table._bbox[0], table._bbox[1]),
|
||||
table._bbox[2] - table._bbox[0],
|
||||
table._bbox[3] - table._bbox[1],
|
||||
fill=False, color="red"
|
||||
)
|
||||
if not _FOR_LATTICE:
|
||||
extend_axe_lim(ax, t)
|
||||
)
|
||||
if not _FOR_LATTICE:
|
||||
extend_axe_lim(ax, table._bbox)
|
||||
|
||||
return ax.get_figure()
|
||||
|
||||
|
|
@ -393,14 +382,12 @@ class PlotMethods():
|
|||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
ax = prepare_plot(table, ax, to_pdf_scale=False)
|
||||
table_bbox = table._bbox_unscaled
|
||||
ax = prepare_plot(table, ax)
|
||||
x_coord = []
|
||||
y_coord = []
|
||||
for k in table_bbox.keys():
|
||||
for coord in table_bbox[k]:
|
||||
x_coord.append(coord[0])
|
||||
y_coord.append(coord[1])
|
||||
for coord in table.parse["joints"]:
|
||||
x_coord.append(coord[0])
|
||||
y_coord.append(coord[1])
|
||||
ax.plot(x_coord, y_coord, "ro")
|
||||
return ax.get_figure()
|
||||
|
||||
|
|
|
|||
108
camelot/utils.py
108
camelot/utils.py
|
|
@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
|
|||
j_x, j_y = zip(*tables[k])
|
||||
j_x = [scale(j, scaling_factor_x) for j in j_x]
|
||||
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
|
||||
joints = zip(j_x, j_y)
|
||||
tables_new[(x1, y1, x2, y2)] = joints
|
||||
tables_new[(x1, y1, x2, y2)] = {
|
||||
"joints": list(zip(j_x, j_y))
|
||||
}
|
||||
|
||||
v_segments_new = []
|
||||
for v in v_segments:
|
||||
|
|
@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
|
|||
)
|
||||
|
||||
|
||||
def bboxes_overlap(bbox1, bbox2):
    """Check whether two bounding boxes intersect.

    Parameters
    ----------
    bbox1 : tuple
        (left, bottom, right, top) coordinates of the first box.
    bbox2 : tuple
        (left, bottom, right, top) coordinates of the second box.

    Returns
    -------
    overlap : bool
        True if the boxes intersect, including when one box fully
        contains the other or when both are identical.  Boxes that only
        touch along an edge or at a corner do not count as overlapping.

    """
    (left1, bottom1, right1, top1) = bbox1
    (left2, bottom2, right2, top2) = bbox2
    # Two intervals intersect when each one starts before the other ends.
    # Only testing whether an edge of bbox2 falls inside bbox1 would miss
    # the cases where bbox2 encloses bbox1, where both boxes are equal, or
    # where they cross each other, and would make the result depend on the
    # order of the arguments.
    return (
        left1 < right2 and left2 < right1
    ) and (
        bottom1 < top2 and bottom2 < top1
    )
|
||||
|
||||
|
||||
def textlines_overlapping_bbox(bbox, textlines):
|
||||
"""Returns all text objects which overlap or are within a bounding box.
|
||||
|
||||
|
|
@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
|
|||
List of PDFMiner text objects.
|
||||
|
||||
"""
|
||||
(left, bottom, right, top) = bbox
|
||||
t_bbox = [
|
||||
t
|
||||
for t in textlines
|
||||
if ((left < t.x0 < right) or (left < t.x1 < right))
|
||||
and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
|
||||
if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
|
||||
]
|
||||
return t_bbox
|
||||
|
||||
|
|
@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
|
|||
return bbox
|
||||
|
||||
|
||||
def find_columns_coordinates(tls, min_gap=1.0):
|
||||
"""Given a list of text objects, guess columns boundaries and returns a
|
||||
list of x-coordinates for split points between columns.
|
||||
def find_columns_boundaries(tls, min_gap=1.0):
|
||||
"""Make a list of disjunct cols boundaries for a list of text objects
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tls : list of PDFMiner text object.
|
||||
|
||||
min_gap : minimum distance between columns. Any elements closer than this
|
||||
threshold are merged together. This is to prevent spaces between words
|
||||
to be misinterpreted as column boundaries.
|
||||
min_gap : minimum distance between columns. Any elements closer than
|
||||
this threshold are merged together. This is to prevent spaces between
|
||||
words to be misinterpreted as boundaries.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols_anchors : list
|
||||
List of x-coordinates for columns.
|
||||
boundaries : list
|
||||
List x-coordinates for cols.
|
||||
[(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
|
||||
|
||||
|
||||
"""
|
||||
# Make a list of disjunct cols boundaries across the textlines
|
||||
# that comprise the table.
|
||||
# [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
|
||||
cols_bounds = []
|
||||
tls.sort(key=lambda tl: tl.x0)
|
||||
for tl in tls:
|
||||
|
|
@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
|
|||
cols_bounds.append([tl.x0, tl.x1])
|
||||
else:
|
||||
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
|
||||
return cols_bounds
|
||||
|
||||
|
||||
def find_rows_boundaries(tls, min_gap=1.0):
    """Group text objects into disjoint vertical bands, one per row.

    Parameters
    ----------
    tls : list of PDFMiner text object.
        Sorted in place by bottom edge (y0) as a side effect.

    min_gap : minimum distance between rows. A text object whose bottom
        edge is not more than this distance above the top of the current
        band is merged into that band.

    Returns
    -------
    boundaries : list
        List of y-coordinates for rows, ordered bottom to top.
        [[1st row bottom, 1st row top], [2nd row bottom, 2nd row top], ...]

    """
    # The sweep below relies on visiting text objects from bottom to top.
    tls.sort(key=lambda tl: tl.y0)
    bands = []
    for textline in tls:
        starts_new_band = (
            not bands or bands[-1][1] + min_gap < textline.y0
        )
        if starts_new_band:
            bands.append([textline.y0, textline.y1])
            continue
        # Still within the current band: stretch its top edge if needed.
        current = bands[-1]
        if textline.y1 > current[1]:
            current[1] = textline.y1
    return bands
|
||||
|
||||
|
||||
def boundaries_to_split_lines(boundaries):
    """Find split lines given a list of boundaries between rows or cols.

    Boundaries:    [ a ]   [b]   [   c   ]   [d]
    Splits:        |     |     |           |   |

    Parameters
    ----------
    boundaries : list
        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
        These are the (left, right most) or (bottom, top most) coordinates.

    Returns
    -------
    anchors : list
        List of coordinates representing the split points: the outer edge
        of the first boundary, the point half way between each pair of
        consecutive boundaries, and the outer edge of the last boundary.
        Empty if no boundaries are given.

    """
    # Nothing to split: avoid indexing into an empty list below.
    if not boundaries:
        return []
    anchors = [boundaries[0][0]]
    # Each inner split sits in the middle of the gap between the end of one
    # boundary and the start of the next.
    anchors.extend(
        (previous[1] + following[0]) / 2.0
        for previous, following in zip(boundaries, boundaries[1:])
    )
    anchors.append(boundaries[-1][1])
    return anchors
|
||||
|
||||
|
||||
def get_index_closest_point(point, sorted_list, fn=lambda x: x):
|
||||
|
|
@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
|
|||
return t
|
||||
|
||||
|
||||
def export_pdf_as_png(pdf_path, destination_path):
|
||||
def export_pdf_as_png(pdf_path, destination_path, resolution=300):
|
||||
"""Generate an image from a pdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pdf_path : str
|
||||
destination_path : str
|
||||
resolution : int
|
||||
"""
|
||||
gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
|
||||
gs_call = "-q -sDEVICE=png16m -o " \
|
||||
"{destination_path} -r{resolution} {pdf_path}" \
|
||||
.format(
|
||||
destination_path=destination_path,
|
||||
resolution=resolution,
|
||||
pdf_path=pdf_path
|
||||
)
|
||||
gs_call = gs_call.encode().split()
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
496
tests/data.py
496
tests/data.py
|
|
@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
|
|||
],
|
||||
]
|
||||
|
||||
# Compared to network, hybrid detects additional sparse columns
|
||||
data_hybrid_vertical_headers = [
|
||||
[
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"Congress-",
|
||||
"",
|
||||
"",
|
||||
"Senator 36th",
|
||||
"",
|
||||
"Rep106th",
|
||||
"",
|
||||
"Reg. of",
|
||||
"",
|
||||
"Road",
|
||||
"",
|
||||
"",
|
||||
"Distri",
|
||||
"Dist",
|
||||
"",
|
||||
"",
|
||||
"Dist",
|
||||
],
|
||||
[
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"1st Dist",
|
||||
"Dist.",
|
||||
"",
|
||||
"",
|
||||
"Dist.",
|
||||
"Deeds",
|
||||
"",
|
||||
"Commission",
|
||||
"",
|
||||
"District #1",
|
||||
"",
|
||||
"ct #2",
|
||||
"#3",
|
||||
"Dist #4",
|
||||
"",
|
||||
"#5",
|
||||
],
|
||||
[
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"Governor",
|
||||
"",
|
||||
"",
|
||||
"U.S. Senator",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"",
|
||||
"Number of Registered voters",
|
||||
"Poll Book Totals",
|
||||
"Brian Calley",
|
||||
"Patrick Colbeck",
|
||||
"Jim Hines",
|
||||
"Bill Schuette",
|
||||
"John James",
|
||||
"Sandy Pensler",
|
||||
"",
|
||||
"Jack Bergman",
|
||||
"",
|
||||
"Jim Stamas",
|
||||
"",
|
||||
"Sue Allor",
|
||||
"",
|
||||
"Melissa A. Cordes",
|
||||
"",
|
||||
"Al Scully",
|
||||
"",
|
||||
"Daniel G. Gauthier",
|
||||
"Craig M. Clemens",
|
||||
"Craig Johnston",
|
||||
"Carolyn Brummund",
|
||||
"Adam Brege",
|
||||
"David Bielusiak",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Alcona",
|
||||
"963",
|
||||
"439",
|
||||
"55",
|
||||
"26",
|
||||
"47",
|
||||
"164",
|
||||
"173",
|
||||
"111",
|
||||
"",
|
||||
"268",
|
||||
"",
|
||||
"272",
|
||||
"",
|
||||
"275",
|
||||
"",
|
||||
"269",
|
||||
"",
|
||||
"271",
|
||||
"",
|
||||
"224",
|
||||
"76",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Caledonia",
|
||||
"923",
|
||||
"393",
|
||||
"40",
|
||||
"23",
|
||||
"45",
|
||||
"158",
|
||||
"150",
|
||||
"103",
|
||||
"",
|
||||
"244",
|
||||
"",
|
||||
"247",
|
||||
"",
|
||||
"254",
|
||||
"",
|
||||
"255",
|
||||
"",
|
||||
"244",
|
||||
"",
|
||||
"139",
|
||||
"143",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Curtis",
|
||||
"1026",
|
||||
"349",
|
||||
"30",
|
||||
"30",
|
||||
"25",
|
||||
"102",
|
||||
"95",
|
||||
"84",
|
||||
"",
|
||||
"159",
|
||||
"",
|
||||
"164",
|
||||
"",
|
||||
"162",
|
||||
"",
|
||||
"161",
|
||||
"",
|
||||
"157",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Greenbush",
|
||||
"1212",
|
||||
"423",
|
||||
"56",
|
||||
"26",
|
||||
"40",
|
||||
"126",
|
||||
"104",
|
||||
"131",
|
||||
"",
|
||||
"208",
|
||||
"",
|
||||
"213",
|
||||
"",
|
||||
"214",
|
||||
"",
|
||||
"215",
|
||||
"",
|
||||
"208",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"208",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Gustin",
|
||||
"611",
|
||||
"180",
|
||||
"22",
|
||||
"35",
|
||||
"17",
|
||||
"55",
|
||||
"73",
|
||||
"45",
|
||||
"",
|
||||
"108",
|
||||
"",
|
||||
"104",
|
||||
"",
|
||||
"111",
|
||||
"",
|
||||
"111",
|
||||
"",
|
||||
"109",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"81",
|
||||
"42",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Harrisville",
|
||||
"1142",
|
||||
"430",
|
||||
"45",
|
||||
"90",
|
||||
"29",
|
||||
"101",
|
||||
"155",
|
||||
"94",
|
||||
"",
|
||||
"226",
|
||||
"",
|
||||
"226",
|
||||
"",
|
||||
"232",
|
||||
"",
|
||||
"244",
|
||||
"",
|
||||
"226",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"232",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Hawes",
|
||||
"884",
|
||||
"293",
|
||||
"38",
|
||||
"36",
|
||||
"27",
|
||||
"109",
|
||||
"121",
|
||||
"84",
|
||||
"",
|
||||
"192",
|
||||
"",
|
||||
"195",
|
||||
"",
|
||||
"195",
|
||||
"",
|
||||
"193",
|
||||
"",
|
||||
"184",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"118",
|
||||
"87",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Haynes",
|
||||
"626",
|
||||
"275",
|
||||
"31",
|
||||
"20",
|
||||
"32",
|
||||
"104",
|
||||
"121",
|
||||
"53",
|
||||
"",
|
||||
"163",
|
||||
"",
|
||||
"163",
|
||||
"",
|
||||
"173",
|
||||
"",
|
||||
"161",
|
||||
"",
|
||||
"152",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"76",
|
||||
"",
|
||||
"69",
|
||||
"31",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Mikado",
|
||||
"781",
|
||||
"208",
|
||||
"19",
|
||||
"39",
|
||||
"17",
|
||||
"81",
|
||||
"90",
|
||||
"63",
|
||||
"",
|
||||
"149",
|
||||
"",
|
||||
"149",
|
||||
"",
|
||||
"145",
|
||||
"",
|
||||
"147",
|
||||
"",
|
||||
"143",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"113",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Millen",
|
||||
"353",
|
||||
"139",
|
||||
"7",
|
||||
"16",
|
||||
"13",
|
||||
"38",
|
||||
"49",
|
||||
"19",
|
||||
"",
|
||||
"62",
|
||||
"",
|
||||
"66",
|
||||
"",
|
||||
"67",
|
||||
"",
|
||||
"66",
|
||||
"",
|
||||
"62",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Mitchell",
|
||||
"327",
|
||||
"96",
|
||||
"12",
|
||||
"17",
|
||||
"7",
|
||||
"29",
|
||||
"41",
|
||||
"17",
|
||||
"",
|
||||
"57",
|
||||
"",
|
||||
"55",
|
||||
"",
|
||||
"57",
|
||||
"",
|
||||
"60",
|
||||
"",
|
||||
"56",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"City Harrisville",
|
||||
"389",
|
||||
"171",
|
||||
"16",
|
||||
"15",
|
||||
"18",
|
||||
"35",
|
||||
"49",
|
||||
"31",
|
||||
"",
|
||||
"78",
|
||||
"",
|
||||
"80",
|
||||
"",
|
||||
"82",
|
||||
"",
|
||||
"81",
|
||||
"",
|
||||
"77",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"73",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
],
|
||||
[
|
||||
"Totals",
|
||||
"9237",
|
||||
"3396",
|
||||
"371",
|
||||
"373",
|
||||
"317",
|
||||
"1102",
|
||||
"1221",
|
||||
"835",
|
||||
"0",
|
||||
"1914",
|
||||
"0",
|
||||
"1934",
|
||||
"",
|
||||
"1967",
|
||||
"",
|
||||
"1963",
|
||||
"0",
|
||||
"1889",
|
||||
"0",
|
||||
"363",
|
||||
"219",
|
||||
"381",
|
||||
"321",
|
||||
"268",
|
||||
"160",
|
||||
"0",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
data_stream_table_areas = [
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 33 KiB After Width: | Height: | Size: 46 KiB |
|
|
@ -291,6 +291,19 @@ def test_network_layout_kwargs():
|
|||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
# Hybrid parser
|
||||
def test_hybrid_vertical_header():
    """Check the hybrid flavor on a complex table whose header text is
    laid out vertically.
    """
    pdf_path = os.path.join(testdir, "vertical_header.pdf")
    tables = camelot.read_pdf(pdf_path, flavor="hybrid")
    assert len(tables) == 1

    expected = pd.DataFrame(data_hybrid_vertical_headers)
    assert_frame_equal(expected, tables[0].df)
|
||||
|
||||
|
||||
# Lattice parser tests
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue