Improve hybrid plotting

* plot info passed through debug_info
* display each text edge
pull/153/head
Frh 2020-04-20 16:54:06 -07:00
parent e0e3ff4e07
commit 1ccaa0630d
9 changed files with 118 additions and 70 deletions

View File

@ -40,10 +40,13 @@ class PDFHandler():
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
debug : bool, optional (default: False)
Whether the parser should store debug information during parsing.
"""
def __init__(self, filepath, pages="1", password=None):
def __init__(self, filepath, pages="1", password=None, debug=False):
self.debug = debug
if is_url(filepath):
filepath = download_url(filepath)
self.filepath = filepath
@ -193,7 +196,7 @@ class PDFHandler():
tables = []
parser_obj = PARSERS[flavor]
parser = parser_obj(**kwargs)
parser = parser_obj(debug=self.debug, **kwargs)
# Read the layouts/dimensions of each of the pages we need to
# parse. This might require creating a temporary .pdf.
@ -204,8 +207,8 @@ class PDFHandler():
)
parser.prepare_page_parse(source_file, layout, dimensions,
page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname)
if not suppress_stdout:
rootname = os.path.basename(parser.rootname)
logger.info(
"Processing {rootname}".format(rootname=rootname))
t = parser.extract_tables()

View File

@ -13,6 +13,7 @@ def read_pdf(
flavor="lattice",
suppress_stdout=False,
layout_kwargs=None,
debug=False,
**kwargs
):
"""Read PDF and return extracted tables.
@ -110,7 +111,7 @@ def read_pdf(
warnings.simplefilter("ignore")
validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,

View File

@ -5,7 +5,9 @@ import warnings
from ..utils import (
get_text_objects,
get_table_index
get_table_index,
text_in_bbox,
bbox_from_str,
)
from ..core import Table
@ -65,7 +67,39 @@ class BaseParser(object):
self.debug_info["table_regions"] = self.table_regions
self.debug_info["table_areas"] = self.table_areas
def _apply_regions_filter(self, textlines):
    """Restrict textlines to the configured table regions, if any.

    Parameters
    ----------
    textlines : list
        Textlines to be filtered.

    Returns
    -------
    filtered_textlines : list
        Always a new list.  When ``self.table_regions`` is None it holds
        every input textline.  Otherwise it is the concatenation, in
        region order, of whatever ``text_in_bbox`` returns for each
        region; no de-duplication is performed across regions.

    """
    regions = self.table_regions
    if regions is None:
        # No regions configured: keep everything, but hand back a copy
        # rather than the caller's own list.
        return list(textlines)

    kept = []
    for region_str in regions:
        # Each region string is converted by bbox_from_str before being
        # handed to text_in_bbox together with the full textline list.
        region_bbox = bbox_from_str(region_str)
        kept.extend(text_in_bbox(region_bbox, textlines))
    return kept
def _document_has_no_text(self):
"""Detects image only documents and warns.
Returns
-------
has_no_text : bool
Whether the document doesn't have any text at all.
"""
if not self.horizontal_text:
rootname = os.path.basename(self.rootname)
if self.images:
@ -81,6 +115,7 @@ class BaseParser(object):
return True
return False
def _initialize_new_table(self, table_idx, cols, rows):
"""Initialize new table object, ready to be populated
Parameters
@ -97,7 +132,6 @@ class BaseParser(object):
table : camelot.core.Table
"""
def _initialize_new_table(self, table_idx, cols, rows):
table = Table(cols, rows)
table.page = self.page
table.order = table_idx + 1

View File

@ -2,7 +2,7 @@
from __future__ import division
import numpy as np
import copy
import warnings
from .base import BaseParser
@ -459,7 +459,6 @@ class TextEdges2(object):
or horizontally. There needs to be connections across both
dimensions.
"""
singleton_textlines = []
removed_singletons = True
while removed_singletons:
removed_singletons = False
@ -471,7 +470,6 @@ class TextEdges2(object):
tl = te.textlines[i]
alignments = self._textlines_alignments[tl]
if alignments.max_h() <= 1 or alignments.max_v() <= 1:
singleton_textlines.append(tl)
del te.textlines[i]
removed_singletons = True
self._textlines_alignments = {}
@ -612,33 +610,27 @@ class TextEdges2(object):
self._register_all_text_lines(textlines)
self._compute_alignment_counts()
def plotFRHAlignments(self, table, plt):
def plot_alignments(self, ax):
"""Displays a visualization of the alignments as currently computed.
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
tls_by_alignment_score = sorted(
# FRHTODO: This is too busy and doesn't plot lines
most_aligned_tl = sorted(
self._textlines_alignments.keys(),
key=lambda textline:
self._textlines_alignments[textline].alignment_score(),
reverse=True
)
)[0]
for tl, alignments in self._textlines_alignments.items():
color = "red"
if tl == tls_by_alignment_score[0]:
color = "blue"
ax.add_patch(
patches.Rectangle(
(tl.x0, tl.y0),
tl.x1 - tl.x0, tl.y1 - tl.y0,
color=color,
(most_aligned_tl.x0, most_aligned_tl.y0),
most_aligned_tl.x1 - most_aligned_tl.x0,
most_aligned_tl.y1 - most_aligned_tl.y0,
color="red",
alpha=0.5
)
)
for tl, alignments in self._textlines_alignments.items():
ax.text(
tl.x0 - 5,
tl.y0 - 5,
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
edge_tol=50,
row_tol=2,
column_tol=0,
debug=False,
**kwargs
):
super().__init__(
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
split_text=split_text,
strip_text=strip_text,
flag_size=flag_size,
debug=debug
)
self.columns = columns
self._validate_columns()
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
raise ValueError("Length of table_areas and columns"
" should be equal")
# FRHTODO: get debug_info to work again
def _generate_table_bbox(self, debug_info=None):
def _generate_table_bbox(self):
if self.table_areas is not None:
table_bbox = {}
for area_str in self.table_areas:
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
return
all_textlines = self.horizontal_text + self.vertical_text
textlines = []
if self.table_regions is None:
textlines = all_textlines
else:
# filter text
for region_str in self.table_regions:
region_text = text_in_bbox(
bbox_from_str(region_str),
all_textlines
)
textlines.extend(region_text)
textlines = self._apply_regions_filter(all_textlines)
textlines_processed = {}
self.table_bbox = {}
if debug_info is not None:
debug_info_bbox_searches = []
debug_info["bboxes_searches"] = debug_info_bbox_searches
if self.debug_info is not None:
debug_info_edges_searches = []
self.debug_info["edges_searches"] = debug_info_edges_searches
debug_info_bboxes_searches = []
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
else:
debug_info_bbox_searches = None
debug_info_edges_searches = None
debug_info_bboxes_searches = None
while True:
self.textedges = TextEdges2()
self.textedges.generate(textlines)
self.textedges._remove_unconnected_edges()
if debug_info_edges_searches is not None:
# Preserve the current edge calculation for display debugging
debug_info_edges_searches.append(
copy.deepcopy(self.textedges)
)
bbox = self.textedges._build_bbox_candidate(
debug_info_bbox_searches
debug_info_bboxes_searches
)
if bbox is None:
break
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
average_tl_height
)
if debug_info is not None:
debug_info["col_searches"].append({
if self.debug_info is not None:
if "col_searches" not in self.debug_info:
self.debug_info["col_searches"] = []
self.debug_info["col_searches"].append({
"core_bbox": bbox,
"cols_anchors": cols_anchors,
"expanded_bbox": expanded_bbox
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):
return table
def extract_tables(self, debug_info=None):
def extract_tables(self):
if self._document_has_no_text():
return []
# Identify plausible areas within the doc where tables lie,
# populate table_bbox keys with these areas.
self._generate_table_bbox(debug_info)
self._generate_table_bbox()
_tables = []
# sort tables based on y-coord

View File

@ -164,6 +164,20 @@ class PlotMethods(object):
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
@staticmethod
def draw_pdf(table, ax):
    """Draw the image of the table's source pdf onto the passed axes.

    Parameters
    ----------
    table : camelot.core.Table
        Must provide ``get_pdf_image()`` and ``pdf_size``.
    ax : matplotlib.axes.Axes
        Axes the image is drawn on.

    """
    img = table.get_pdf_image()
    # The extent spans 0..pdf_size[0] horizontally and 0..pdf_size[1]
    # vertically, so the image fills that coordinate range on the axes.
    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
@staticmethod
def textedge(table):
"""Generates a plot for relevant textedges.
@ -179,6 +193,7 @@ class PlotMethods(object):
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
PlotMethods.draw_pdf(table, ax)
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
@ -193,11 +208,13 @@ class PlotMethods(object):
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
if table.flavor == "hybrid":
# FRHTODO: Clean this up
table.debug_info["edges_searches"][0].plot_alignments(ax)
else:
for te in table._textedges:
ax.plot([te.x, te.x], [te.y0, te.y1])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig
@staticmethod

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 49 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 197 KiB

View File

@ -129,5 +129,5 @@ def test_stream_textedge_plot():
baseline_dir="files/baseline_plots", remove_text=True)
def test_hybrid_textedge_plot():
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='hybrid')
tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
return camelot.plot(tables[0], kind='textedge')