Improve hybrid plotting

* plot info passed through debug_info
* display each text edge
pull/153/head
Frh 2020-04-20 16:54:06 -07:00
parent e0e3ff4e07
commit 1ccaa0630d
9 changed files with 118 additions and 70 deletions

View File

@ -40,10 +40,13 @@ class PDFHandler():
Example: '1,3,4' or '1,4-end' or 'all'. Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None) password : str, optional (default: None)
Password for decryption. Password for decryption.
debug : bool, optional (default: False)
Whether the parser should store debug information during parsing.
""" """
def __init__(self, filepath, pages="1", password=None): def __init__(self, filepath, pages="1", password=None, debug=False):
self.debug = debug
if is_url(filepath): if is_url(filepath):
filepath = download_url(filepath) filepath = download_url(filepath)
self.filepath = filepath self.filepath = filepath
@ -193,7 +196,7 @@ class PDFHandler():
tables = [] tables = []
parser_obj = PARSERS[flavor] parser_obj = PARSERS[flavor]
parser = parser_obj(**kwargs) parser = parser_obj(debug=self.debug, **kwargs)
# Read the layouts/dimensions of each of the pages we need to # Read the layouts/dimensions of each of the pages we need to
# parse. This might require creating a temporary .pdf. # parse. This might require creating a temporary .pdf.
@ -204,8 +207,8 @@ class PDFHandler():
) )
parser.prepare_page_parse(source_file, layout, dimensions, parser.prepare_page_parse(source_file, layout, dimensions,
page_idx, layout_kwargs) page_idx, layout_kwargs)
rootname = os.path.basename(parser.rootname)
if not suppress_stdout: if not suppress_stdout:
rootname = os.path.basename(parser.rootname)
logger.info( logger.info(
"Processing {rootname}".format(rootname=rootname)) "Processing {rootname}".format(rootname=rootname))
t = parser.extract_tables() t = parser.extract_tables()

View File

@ -13,6 +13,7 @@ def read_pdf(
flavor="lattice", flavor="lattice",
suppress_stdout=False, suppress_stdout=False,
layout_kwargs=None, layout_kwargs=None,
debug=False,
**kwargs **kwargs
): ):
"""Read PDF and return extracted tables. """Read PDF and return extracted tables.
@ -110,7 +111,7 @@ def read_pdf(
warnings.simplefilter("ignore") warnings.simplefilter("ignore")
validate_input(kwargs, flavor=flavor) validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password) p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
kwargs = remove_extra(kwargs, flavor=flavor) kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse( tables = p.parse(
flavor=flavor, flavor=flavor,

View File

@ -5,7 +5,9 @@ import warnings
from ..utils import ( from ..utils import (
get_text_objects, get_text_objects,
get_table_index get_table_index,
text_in_bbox,
bbox_from_str,
) )
from ..core import Table from ..core import Table
@ -65,7 +67,39 @@ class BaseParser(object):
self.debug_info["table_regions"] = self.table_regions self.debug_info["table_regions"] = self.table_regions
self.debug_info["table_areas"] = self.table_areas self.debug_info["table_areas"] = self.table_areas
def _apply_regions_filter(self, textlines):
    """Restrict textlines to the configured table regions, if any.

    Parameters
    ----------
    textlines : list
        Textlines to be filtered.

    Returns
    -------
    filtered_textlines : list
        A new list. When ``self.table_regions`` is None it contains every
        input textline. Otherwise it contains the textlines that
        ``text_in_bbox`` returns for at least one region, in first-seen
        order, with each textline object present only once even when
        regions overlap.

    """
    if self.table_regions is None:
        # No regions configured: keep everything, but hand back a copy so
        # the caller's list is never aliased.
        return list(textlines)

    filtered_textlines = []
    seen_ids = set()
    for region_str in self.table_regions:
        region_bbox = bbox_from_str(region_str)
        for textline in text_in_bbox(region_bbox, textlines):
            # Overlapping regions can yield the same textline object more
            # than once; track identity so it is only kept the first time.
            if id(textline) not in seen_ids:
                seen_ids.add(id(textline))
                filtered_textlines.append(textline)
    return filtered_textlines
def _document_has_no_text(self): def _document_has_no_text(self):
"""Detects image only documents and warns.
Returns
-------
has_no_text : bool
Whether the document doesn't have any text at all.
"""
if not self.horizontal_text: if not self.horizontal_text:
rootname = os.path.basename(self.rootname) rootname = os.path.basename(self.rootname)
if self.images: if self.images:
@ -81,23 +115,23 @@ class BaseParser(object):
return True return True
return False return False
"""Initialize new table object, ready to be populated
Parameters
----------
table_idx : int
Index of this table within the pdf page analyzed
cols : list
list of coordinate boundaries tuples (left, right)
rows : list
list of coordinate boundaries tuples (bottom, top)
Returns
-------
table : camelot.core.Table
"""
def _initialize_new_table(self, table_idx, cols, rows): def _initialize_new_table(self, table_idx, cols, rows):
"""Initialize new table object, ready to be populated
Parameters
----------
table_idx : int
Index of this table within the pdf page analyzed
cols : list
list of coordinate boundaries tuples (left, right)
rows : list
list of coordinate boundaries tuples (bottom, top)
Returns
-------
table : camelot.core.Table
"""
table = Table(cols, rows) table = Table(cols, rows)
table.page = self.page table.page = self.page
table.order = table_idx + 1 table.order = table_idx + 1

View File

@ -2,7 +2,7 @@
from __future__ import division from __future__ import division
import numpy as np import numpy as np
import copy
import warnings import warnings
from .base import BaseParser from .base import BaseParser
@ -459,7 +459,6 @@ class TextEdges2(object):
or horizontally. There needs to be connections across both or horizontally. There needs to be connections across both
dimensions. dimensions.
""" """
singleton_textlines = []
removed_singletons = True removed_singletons = True
while removed_singletons: while removed_singletons:
removed_singletons = False removed_singletons = False
@ -471,7 +470,6 @@ class TextEdges2(object):
tl = te.textlines[i] tl = te.textlines[i]
alignments = self._textlines_alignments[tl] alignments = self._textlines_alignments[tl]
if alignments.max_h() <= 1 or alignments.max_v() <= 1: if alignments.max_h() <= 1 or alignments.max_v() <= 1:
singleton_textlines.append(tl)
del te.textlines[i] del te.textlines[i]
removed_singletons = True removed_singletons = True
self._textlines_alignments = {} self._textlines_alignments = {}
@ -612,33 +610,27 @@ class TextEdges2(object):
self._register_all_text_lines(textlines) self._register_all_text_lines(textlines)
self._compute_alignment_counts() self._compute_alignment_counts()
def plotFRHAlignments(self, table, plt): def plot_alignments(self, ax):
"""Displays a visualization of the alignments as currently computed. """Displays a visualization of the alignments as currently computed.
""" """
fig = plt.figure() # FRHTODO: This is too busy and doesn't plot lines
ax = fig.add_subplot(111, aspect="equal") most_aligned_tl = sorted(
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
tls_by_alignment_score = sorted(
self._textlines_alignments.keys(), self._textlines_alignments.keys(),
key=lambda textline: key=lambda textline:
self._textlines_alignments[textline].alignment_score(), self._textlines_alignments[textline].alignment_score(),
reverse=True reverse=True
) )[0]
for tl, alignments in self._textlines_alignments.items(): ax.add_patch(
color = "red" patches.Rectangle(
if tl == tls_by_alignment_score[0]: (most_aligned_tl.x0, most_aligned_tl.y0),
color = "blue" most_aligned_tl.x1 - most_aligned_tl.x0,
ax.add_patch( most_aligned_tl.y1 - most_aligned_tl.y0,
patches.Rectangle( color="red",
(tl.x0, tl.y0), alpha=0.5
tl.x1 - tl.x0, tl.y1 - tl.y0,
color=color,
alpha=0.5
)
) )
)
for tl, alignments in self._textlines_alignments.items():
ax.text( ax.text(
tl.x0 - 5, tl.x0 - 5,
tl.y0 - 5, tl.y0 - 5,
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
edge_tol=50, edge_tol=50,
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
debug=False,
**kwargs **kwargs
): ):
super().__init__( super().__init__(
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
split_text=split_text, split_text=split_text,
strip_text=strip_text, strip_text=strip_text,
flag_size=flag_size, flag_size=flag_size,
debug=debug
) )
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns"
" should be equal") " should be equal")
# FRHTODO: get debug_info to work again def _generate_table_bbox(self):
def _generate_table_bbox(self, debug_info=None):
if self.table_areas is not None: if self.table_areas is not None:
table_bbox = {} table_bbox = {}
for area_str in self.table_areas: for area_str in self.table_areas:
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
return return
all_textlines = self.horizontal_text + self.vertical_text all_textlines = self.horizontal_text + self.vertical_text
textlines = [] textlines = self._apply_regions_filter(all_textlines)
if self.table_regions is None:
textlines = all_textlines
else:
# filter text
for region_str in self.table_regions:
region_text = text_in_bbox(
bbox_from_str(region_str),
all_textlines
)
textlines.extend(region_text)
textlines_processed = {} textlines_processed = {}
self.table_bbox = {} self.table_bbox = {}
if debug_info is not None: if self.debug_info is not None:
debug_info_bbox_searches = [] debug_info_edges_searches = []
debug_info["bboxes_searches"] = debug_info_bbox_searches self.debug_info["edges_searches"] = debug_info_edges_searches
debug_info_bboxes_searches = []
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
else: else:
debug_info_bbox_searches = None debug_info_edges_searches = None
debug_info_bboxes_searches = None
while True: while True:
self.textedges = TextEdges2() self.textedges = TextEdges2()
self.textedges.generate(textlines) self.textedges.generate(textlines)
self.textedges._remove_unconnected_edges() self.textedges._remove_unconnected_edges()
if debug_info_edges_searches is not None:
# Preserve the current edge calculation for display debugging
debug_info_edges_searches.append(
copy.deepcopy(self.textedges)
)
bbox = self.textedges._build_bbox_candidate( bbox = self.textedges._build_bbox_candidate(
debug_info_bbox_searches debug_info_bboxes_searches
) )
if bbox is None: if bbox is None:
break break
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
average_tl_height average_tl_height
) )
if debug_info is not None: if self.debug_info is not None:
debug_info["col_searches"].append({ if "col_searches" not in self.debug_info:
self.debug_info["col_searches"] = []
self.debug_info["col_searches"].append({
"core_bbox": bbox, "core_bbox": bbox,
"cols_anchors": cols_anchors, "cols_anchors": cols_anchors,
"expanded_bbox": expanded_bbox "expanded_bbox": expanded_bbox
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):
return table return table
def extract_tables(self, debug_info=None): def extract_tables(self):
if self._document_has_no_text(): if self._document_has_no_text():
return [] return []
# Identify plausible areas within the doc where tables lie, # Identify plausible areas within the doc where tables lie,
# populate table_bbox keys with these areas. # populate table_bbox keys with these areas.
self._generate_table_bbox(debug_info) self._generate_table_bbox()
_tables = [] _tables = []
# sort tables based on y-coord # sort tables based on y-coord

View File

@ -164,6 +164,20 @@ class PlotMethods(object):
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
@staticmethod
def draw_pdf(table, ax):
    """Draw the image of the table's source pdf into the given axes.

    Parameters
    ----------
    table : camelot.core.Table
        Table providing ``get_pdf_image()`` and ``pdf_size``.
    ax : matplotlib.axes.Axes
        Axes the image is drawn into.

    """
    page_image = table.get_pdf_image()
    # imshow's extent is (left, right, bottom, top): the image is mapped
    # onto the rectangle from (0, 0) to (pdf_size[0], pdf_size[1]).
    x_max = table.pdf_size[0]
    y_max = table.pdf_size[1]
    ax.imshow(page_image, extent=(0, x_max, 0, y_max))
@staticmethod @staticmethod
def textedge(table): def textedge(table):
"""Generates a plot for relevant textedges. """Generates a plot for relevant textedges.
@ -179,6 +193,7 @@ class PlotMethods(object):
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal") ax = fig.add_subplot(111, aspect="equal")
PlotMethods.draw_pdf(table, ax)
xs, ys = [], [] xs, ys = [], []
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
@ -193,11 +208,13 @@ class PlotMethods(object):
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
for te in table._textedges: if table.flavor == "hybrid":
ax.plot([te.x, te.x], [te.y0, te.y1]) # FRHTODO: Clean this up
table.debug_info["edges_searches"][0].plot_alignments(ax)
else:
for te in table._textedges:
ax.plot([te.x, te.x], [te.y0, te.y1])
img = table.get_pdf_image()
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
return fig return fig
@staticmethod @staticmethod

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 49 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 197 KiB

View File

@ -129,5 +129,5 @@ def test_stream_textedge_plot():
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_hybrid_textedge_plot(): def test_hybrid_textedge_plot():
filename = os.path.join(testdir, "tabula/12s0324.pdf") filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='hybrid') tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
return camelot.plot(tables[0], kind='textedge') return camelot.plot(tables[0], kind='textedge')