Improve hybrid plotting
* plot info passed through debug_info * display each text edge
pull/153/head
parent
e0e3ff4e07
commit
1ccaa0630d
|
|
@ -40,10 +40,13 @@ class PDFHandler():
|
|||
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||
password : str, optional (default: None)
|
||||
Password for decryption.
|
||||
debug : bool, optional (default: False)
|
||||
Whether the parser should store debug information during parsing.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, filepath, pages="1", password=None):
|
||||
def __init__(self, filepath, pages="1", password=None, debug=False):
|
||||
self.debug = debug
|
||||
if is_url(filepath):
|
||||
filepath = download_url(filepath)
|
||||
self.filepath = filepath
|
||||
|
|
@ -193,7 +196,7 @@ class PDFHandler():
|
|||
tables = []
|
||||
|
||||
parser_obj = PARSERS[flavor]
|
||||
parser = parser_obj(**kwargs)
|
||||
parser = parser_obj(debug=self.debug, **kwargs)
|
||||
|
||||
# Read the layouts/dimensions of each of the pages we need to
|
||||
# parse. This might require creating a temporary .pdf.
|
||||
|
|
@ -204,8 +207,8 @@ class PDFHandler():
|
|||
)
|
||||
parser.prepare_page_parse(source_file, layout, dimensions,
|
||||
page_idx, layout_kwargs)
|
||||
rootname = os.path.basename(parser.rootname)
|
||||
if not suppress_stdout:
|
||||
rootname = os.path.basename(parser.rootname)
|
||||
logger.info(
|
||||
"Processing {rootname}".format(rootname=rootname))
|
||||
t = parser.extract_tables()
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ def read_pdf(
|
|||
flavor="lattice",
|
||||
suppress_stdout=False,
|
||||
layout_kwargs=None,
|
||||
debug=False,
|
||||
**kwargs
|
||||
):
|
||||
"""Read PDF and return extracted tables.
|
||||
|
|
@ -110,7 +111,7 @@ def read_pdf(
|
|||
warnings.simplefilter("ignore")
|
||||
|
||||
validate_input(kwargs, flavor=flavor)
|
||||
p = PDFHandler(filepath, pages=pages, password=password)
|
||||
p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
|
||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||
tables = p.parse(
|
||||
flavor=flavor,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,9 @@ import warnings
|
|||
|
||||
from ..utils import (
|
||||
get_text_objects,
|
||||
get_table_index
|
||||
get_table_index,
|
||||
text_in_bbox,
|
||||
bbox_from_str,
|
||||
)
|
||||
from ..core import Table
|
||||
|
||||
|
|
@ -65,7 +67,39 @@ class BaseParser(object):
|
|||
self.debug_info["table_regions"] = self.table_regions
|
||||
self.debug_info["table_areas"] = self.table_areas
|
||||
|
||||
def _apply_regions_filter(self, textlines):
|
||||
"""If regions have been specified, filter textlines to these regions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
textlines : list
|
||||
list of textlines to be filtered
|
||||
|
||||
Returns
|
||||
-------
|
||||
filtered_textlines : list of textlines within the regions specified
|
||||
|
||||
"""
|
||||
filtered_textlines = []
|
||||
if self.table_regions is None:
|
||||
filtered_textlines.extend(textlines)
|
||||
else:
|
||||
for region_str in self.table_regions:
|
||||
region_text = text_in_bbox(
|
||||
bbox_from_str(region_str),
|
||||
textlines
|
||||
)
|
||||
filtered_textlines.extend(region_text)
|
||||
return filtered_textlines
|
||||
|
||||
def _document_has_no_text(self):
|
||||
"""Detects image-only documents and warns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
has_no_text : bool
|
||||
Whether the document doesn't have any text at all.
|
||||
"""
|
||||
if not self.horizontal_text:
|
||||
rootname = os.path.basename(self.rootname)
|
||||
if self.images:
|
||||
|
|
@ -81,6 +115,7 @@ class BaseParser(object):
|
|||
return True
|
||||
return False
|
||||
|
||||
def _initialize_new_table(self, table_idx, cols, rows):
|
||||
"""Initialize new table object, ready to be populated
|
||||
|
||||
Parameters
|
||||
|
|
@ -97,7 +132,6 @@ class BaseParser(object):
|
|||
table : camelot.core.Table
|
||||
|
||||
"""
|
||||
def _initialize_new_table(self, table_idx, cols, rows):
|
||||
table = Table(cols, rows)
|
||||
table.page = self.page
|
||||
table.order = table_idx + 1
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import division
|
||||
|
||||
import numpy as np
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
|
||||
from .base import BaseParser
|
||||
|
|
@ -459,7 +459,6 @@ class TextEdges2(object):
|
|||
or horizontally. There needs to be connections across both
|
||||
dimensions.
|
||||
"""
|
||||
singleton_textlines = []
|
||||
removed_singletons = True
|
||||
while removed_singletons:
|
||||
removed_singletons = False
|
||||
|
|
@ -471,7 +470,6 @@ class TextEdges2(object):
|
|||
tl = te.textlines[i]
|
||||
alignments = self._textlines_alignments[tl]
|
||||
if alignments.max_h() <= 1 or alignments.max_v() <= 1:
|
||||
singleton_textlines.append(tl)
|
||||
del te.textlines[i]
|
||||
removed_singletons = True
|
||||
self._textlines_alignments = {}
|
||||
|
|
@ -612,33 +610,27 @@ class TextEdges2(object):
|
|||
self._register_all_text_lines(textlines)
|
||||
self._compute_alignment_counts()
|
||||
|
||||
def plotFRHAlignments(self, table, plt):
|
||||
def plot_alignments(self, ax):
|
||||
"""Displays a visualization of the alignments as currently computed.
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
img = table.get_pdf_image()
|
||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
|
||||
tls_by_alignment_score = sorted(
|
||||
# FRHTODO: This is too busy and doesn't plot lines
|
||||
most_aligned_tl = sorted(
|
||||
self._textlines_alignments.keys(),
|
||||
key=lambda textline:
|
||||
self._textlines_alignments[textline].alignment_score(),
|
||||
reverse=True
|
||||
)
|
||||
)[0]
|
||||
|
||||
for tl, alignments in self._textlines_alignments.items():
|
||||
color = "red"
|
||||
if tl == tls_by_alignment_score[0]:
|
||||
color = "blue"
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(tl.x0, tl.y0),
|
||||
tl.x1 - tl.x0, tl.y1 - tl.y0,
|
||||
color=color,
|
||||
(most_aligned_tl.x0, most_aligned_tl.y0),
|
||||
most_aligned_tl.x1 - most_aligned_tl.x0,
|
||||
most_aligned_tl.y1 - most_aligned_tl.y0,
|
||||
color="red",
|
||||
alpha=0.5
|
||||
)
|
||||
)
|
||||
for tl, alignments in self._textlines_alignments.items():
|
||||
ax.text(
|
||||
tl.x0 - 5,
|
||||
tl.y0 - 5,
|
||||
|
|
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
|
|||
edge_tol=50,
|
||||
row_tol=2,
|
||||
column_tol=0,
|
||||
debug=False,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
|
|
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
|
|||
split_text=split_text,
|
||||
strip_text=strip_text,
|
||||
flag_size=flag_size,
|
||||
debug=debug
|
||||
)
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
|
|
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
|
|||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
|
||||
# FRHTODO: get debug_info to work again
|
||||
def _generate_table_bbox(self, debug_info=None):
|
||||
def _generate_table_bbox(self):
|
||||
if self.table_areas is not None:
|
||||
table_bbox = {}
|
||||
for area_str in self.table_areas:
|
||||
|
|
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
|
|||
return
|
||||
|
||||
all_textlines = self.horizontal_text + self.vertical_text
|
||||
textlines = []
|
||||
if self.table_regions is None:
|
||||
textlines = all_textlines
|
||||
else:
|
||||
# filter text
|
||||
for region_str in self.table_regions:
|
||||
region_text = text_in_bbox(
|
||||
bbox_from_str(region_str),
|
||||
all_textlines
|
||||
)
|
||||
textlines.extend(region_text)
|
||||
textlines = self._apply_regions_filter(all_textlines)
|
||||
|
||||
textlines_processed = {}
|
||||
self.table_bbox = {}
|
||||
if debug_info is not None:
|
||||
debug_info_bbox_searches = []
|
||||
debug_info["bboxes_searches"] = debug_info_bbox_searches
|
||||
if self.debug_info is not None:
|
||||
debug_info_edges_searches = []
|
||||
self.debug_info["edges_searches"] = debug_info_edges_searches
|
||||
debug_info_bboxes_searches = []
|
||||
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
|
||||
else:
|
||||
debug_info_bbox_searches = None
|
||||
debug_info_edges_searches = None
|
||||
debug_info_bboxes_searches = None
|
||||
|
||||
while True:
|
||||
self.textedges = TextEdges2()
|
||||
self.textedges.generate(textlines)
|
||||
self.textedges._remove_unconnected_edges()
|
||||
if debug_info_edges_searches is not None:
|
||||
# Preserve the current edge calculation for display debugging
|
||||
debug_info_edges_searches.append(
|
||||
copy.deepcopy(self.textedges)
|
||||
)
|
||||
bbox = self.textedges._build_bbox_candidate(
|
||||
debug_info_bbox_searches
|
||||
debug_info_bboxes_searches
|
||||
)
|
||||
if bbox is None:
|
||||
break
|
||||
|
|
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
|
|||
average_tl_height
|
||||
)
|
||||
|
||||
if debug_info is not None:
|
||||
debug_info["col_searches"].append({
|
||||
if self.debug_info is not None:
|
||||
if "col_searches" not in self.debug_info:
|
||||
self.debug_info["col_searches"] = []
|
||||
self.debug_info["col_searches"].append({
|
||||
"core_bbox": bbox,
|
||||
"cols_anchors": cols_anchors,
|
||||
"expanded_bbox": expanded_bbox
|
||||
|
|
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):
|
|||
|
||||
return table
|
||||
|
||||
def extract_tables(self, debug_info=None):
|
||||
def extract_tables(self):
|
||||
if self._document_has_no_text():
|
||||
return []
|
||||
|
||||
# Identify plausible areas within the doc where tables lie,
|
||||
# populate table_bbox keys with these areas.
|
||||
self._generate_table_bbox(debug_info)
|
||||
self._generate_table_bbox()
|
||||
|
||||
_tables = []
|
||||
# sort tables based on y-coord
|
||||
|
|
|
|||
|
|
@ -164,6 +164,20 @@ class PlotMethods(object):
|
|||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
return fig
|
||||
|
||||
@staticmethod
|
||||
def draw_pdf(table, ax):
|
||||
"""Draw the content of the table's source pdf into the passed subplot
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
|
||||
"""
|
||||
img = table.get_pdf_image()
|
||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
|
||||
@staticmethod
|
||||
def textedge(table):
|
||||
"""Generates a plot for relevant textedges.
|
||||
|
|
@ -179,6 +193,7 @@ class PlotMethods(object):
|
|||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
PlotMethods.draw_pdf(table, ax)
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
|
|
@ -193,11 +208,13 @@ class PlotMethods(object):
|
|||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
|
||||
if table.flavor == "hybrid":
|
||||
# FRHTODO: Clean this up
|
||||
table.debug_info["edges_searches"][0].plot_alignments(ax)
|
||||
else:
|
||||
for te in table._textedges:
|
||||
ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||
|
||||
img = table.get_pdf_image()
|
||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||
return fig
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 16 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 49 KiB After Width: | Height: | Size: 48 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 197 KiB |
|
|
@ -129,5 +129,5 @@ def test_stream_textedge_plot():
|
|||
baseline_dir="files/baseline_plots", remove_text=True)
|
||||
def test_hybrid_textedge_plot():
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor='hybrid')
|
||||
tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
|
||||
return camelot.plot(tables[0], kind='textedge')
|
||||
|
|
|
|||
Loading…
Reference in New Issue