Improve hybrid plotting
* plot info passed through debug_info
* display each text edge
pull/153/head
parent
e0e3ff4e07
commit
1ccaa0630d
|
|
@ -40,10 +40,13 @@ class PDFHandler():
|
||||||
Example: '1,3,4' or '1,4-end' or 'all'.
|
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||||
password : str, optional (default: None)
|
password : str, optional (default: None)
|
||||||
Password for decryption.
|
Password for decryption.
|
||||||
|
debug : bool, optional (default: False)
|
||||||
|
Whether the parser should store debug information during parsing.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, filepath, pages="1", password=None):
|
def __init__(self, filepath, pages="1", password=None, debug=False):
|
||||||
|
self.debug = debug
|
||||||
if is_url(filepath):
|
if is_url(filepath):
|
||||||
filepath = download_url(filepath)
|
filepath = download_url(filepath)
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
|
|
@ -193,7 +196,7 @@ class PDFHandler():
|
||||||
tables = []
|
tables = []
|
||||||
|
|
||||||
parser_obj = PARSERS[flavor]
|
parser_obj = PARSERS[flavor]
|
||||||
parser = parser_obj(**kwargs)
|
parser = parser_obj(debug=self.debug, **kwargs)
|
||||||
|
|
||||||
# Read the layouts/dimensions of each of the pages we need to
|
# Read the layouts/dimensions of each of the pages we need to
|
||||||
# parse. This might require creating a temporary .pdf.
|
# parse. This might require creating a temporary .pdf.
|
||||||
|
|
@ -204,8 +207,8 @@ class PDFHandler():
|
||||||
)
|
)
|
||||||
parser.prepare_page_parse(source_file, layout, dimensions,
|
parser.prepare_page_parse(source_file, layout, dimensions,
|
||||||
page_idx, layout_kwargs)
|
page_idx, layout_kwargs)
|
||||||
rootname = os.path.basename(parser.rootname)
|
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
|
rootname = os.path.basename(parser.rootname)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Processing {rootname}".format(rootname=rootname))
|
"Processing {rootname}".format(rootname=rootname))
|
||||||
t = parser.extract_tables()
|
t = parser.extract_tables()
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ def read_pdf(
|
||||||
flavor="lattice",
|
flavor="lattice",
|
||||||
suppress_stdout=False,
|
suppress_stdout=False,
|
||||||
layout_kwargs=None,
|
layout_kwargs=None,
|
||||||
|
debug=False,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
@ -110,7 +111,7 @@ def read_pdf(
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
|
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages=pages, password=password)
|
p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(
|
tables = p.parse(
|
||||||
flavor=flavor,
|
flavor=flavor,
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,9 @@ import warnings
|
||||||
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
get_text_objects,
|
get_text_objects,
|
||||||
get_table_index
|
get_table_index,
|
||||||
|
text_in_bbox,
|
||||||
|
bbox_from_str,
|
||||||
)
|
)
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
|
|
||||||
|
|
@ -65,7 +67,39 @@ class BaseParser(object):
|
||||||
self.debug_info["table_regions"] = self.table_regions
|
self.debug_info["table_regions"] = self.table_regions
|
||||||
self.debug_info["table_areas"] = self.table_areas
|
self.debug_info["table_areas"] = self.table_areas
|
||||||
|
|
||||||
|
def _apply_regions_filter(self, textlines):
|
||||||
|
"""If regions have been specified, filter textlines to these regions.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
textlines : list
|
||||||
|
list of textlines to be filtered
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
filtered_textlines : list of textlines within the regions specified
|
||||||
|
|
||||||
|
"""
|
||||||
|
filtered_textlines = []
|
||||||
|
if self.table_regions is None:
|
||||||
|
filtered_textlines.extend(textlines)
|
||||||
|
else:
|
||||||
|
for region_str in self.table_regions:
|
||||||
|
region_text = text_in_bbox(
|
||||||
|
bbox_from_str(region_str),
|
||||||
|
textlines
|
||||||
|
)
|
||||||
|
filtered_textlines.extend(region_text)
|
||||||
|
return filtered_textlines
|
||||||
|
|
||||||
def _document_has_no_text(self):
|
def _document_has_no_text(self):
|
||||||
|
"""Detects image only documents and warns.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
has_no_text : bool
|
||||||
|
Whether the document doesn't have any text at all.
|
||||||
|
"""
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
rootname = os.path.basename(self.rootname)
|
rootname = os.path.basename(self.rootname)
|
||||||
if self.images:
|
if self.images:
|
||||||
|
|
@ -81,23 +115,23 @@ class BaseParser(object):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
"""Initialize new table object, ready to be populated
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
table_idx : int
|
|
||||||
Index of this table within the pdf page analyzed
|
|
||||||
cols : list
|
|
||||||
list of coordinate boundaries tuples (left, right)
|
|
||||||
rows : list
|
|
||||||
list of coordinate boundaries tuples (bottom, top)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
table : camelot.core.Table
|
|
||||||
|
|
||||||
"""
|
|
||||||
def _initialize_new_table(self, table_idx, cols, rows):
|
def _initialize_new_table(self, table_idx, cols, rows):
|
||||||
|
"""Initialize new table object, ready to be populated
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table_idx : int
|
||||||
|
Index of this table within the pdf page analyzed
|
||||||
|
cols : list
|
||||||
|
list of coordinate boundaries tuples (left, right)
|
||||||
|
rows : list
|
||||||
|
list of coordinate boundaries tuples (bottom, top)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
|
"""
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table.page = self.page
|
table.page = self.page
|
||||||
table.order = table_idx + 1
|
table.order = table_idx + 1
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import copy
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
|
|
@ -459,7 +459,6 @@ class TextEdges2(object):
|
||||||
or horizontally. There needs to be connections across both
|
or horizontally. There needs to be connections across both
|
||||||
dimensions.
|
dimensions.
|
||||||
"""
|
"""
|
||||||
singleton_textlines = []
|
|
||||||
removed_singletons = True
|
removed_singletons = True
|
||||||
while removed_singletons:
|
while removed_singletons:
|
||||||
removed_singletons = False
|
removed_singletons = False
|
||||||
|
|
@ -471,7 +470,6 @@ class TextEdges2(object):
|
||||||
tl = te.textlines[i]
|
tl = te.textlines[i]
|
||||||
alignments = self._textlines_alignments[tl]
|
alignments = self._textlines_alignments[tl]
|
||||||
if alignments.max_h() <= 1 or alignments.max_v() <= 1:
|
if alignments.max_h() <= 1 or alignments.max_v() <= 1:
|
||||||
singleton_textlines.append(tl)
|
|
||||||
del te.textlines[i]
|
del te.textlines[i]
|
||||||
removed_singletons = True
|
removed_singletons = True
|
||||||
self._textlines_alignments = {}
|
self._textlines_alignments = {}
|
||||||
|
|
@ -612,33 +610,27 @@ class TextEdges2(object):
|
||||||
self._register_all_text_lines(textlines)
|
self._register_all_text_lines(textlines)
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
||||||
def plotFRHAlignments(self, table, plt):
|
def plot_alignments(self, ax):
|
||||||
"""Displays a visualization of the alignments as currently computed.
|
"""Displays a visualization of the alignments as currently computed.
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
# FRHTODO: This is too busy and doesn't plot lines
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
most_aligned_tl = sorted(
|
||||||
img = table.get_pdf_image()
|
|
||||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
|
||||||
|
|
||||||
tls_by_alignment_score = sorted(
|
|
||||||
self._textlines_alignments.keys(),
|
self._textlines_alignments.keys(),
|
||||||
key=lambda textline:
|
key=lambda textline:
|
||||||
self._textlines_alignments[textline].alignment_score(),
|
self._textlines_alignments[textline].alignment_score(),
|
||||||
reverse=True
|
reverse=True
|
||||||
)
|
)[0]
|
||||||
|
|
||||||
for tl, alignments in self._textlines_alignments.items():
|
ax.add_patch(
|
||||||
color = "red"
|
patches.Rectangle(
|
||||||
if tl == tls_by_alignment_score[0]:
|
(most_aligned_tl.x0, most_aligned_tl.y0),
|
||||||
color = "blue"
|
most_aligned_tl.x1 - most_aligned_tl.x0,
|
||||||
ax.add_patch(
|
most_aligned_tl.y1 - most_aligned_tl.y0,
|
||||||
patches.Rectangle(
|
color="red",
|
||||||
(tl.x0, tl.y0),
|
alpha=0.5
|
||||||
tl.x1 - tl.x0, tl.y1 - tl.y0,
|
|
||||||
color=color,
|
|
||||||
alpha=0.5
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
for tl, alignments in self._textlines_alignments.items():
|
||||||
ax.text(
|
ax.text(
|
||||||
tl.x0 - 5,
|
tl.x0 - 5,
|
||||||
tl.y0 - 5,
|
tl.y0 - 5,
|
||||||
|
|
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
|
||||||
edge_tol=50,
|
edge_tol=50,
|
||||||
row_tol=2,
|
row_tol=2,
|
||||||
column_tol=0,
|
column_tol=0,
|
||||||
|
debug=False,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
|
|
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
|
||||||
split_text=split_text,
|
split_text=split_text,
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
flag_size=flag_size,
|
flag_size=flag_size,
|
||||||
|
debug=debug
|
||||||
)
|
)
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
|
||||||
raise ValueError("Length of table_areas and columns"
|
raise ValueError("Length of table_areas and columns"
|
||||||
" should be equal")
|
" should be equal")
|
||||||
|
|
||||||
# FRHTODO: get debug_info to work again
|
def _generate_table_bbox(self):
|
||||||
def _generate_table_bbox(self, debug_info=None):
|
|
||||||
if self.table_areas is not None:
|
if self.table_areas is not None:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area_str in self.table_areas:
|
for area_str in self.table_areas:
|
||||||
|
|
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
all_textlines = self.horizontal_text + self.vertical_text
|
all_textlines = self.horizontal_text + self.vertical_text
|
||||||
textlines = []
|
textlines = self._apply_regions_filter(all_textlines)
|
||||||
if self.table_regions is None:
|
|
||||||
textlines = all_textlines
|
|
||||||
else:
|
|
||||||
# filter text
|
|
||||||
for region_str in self.table_regions:
|
|
||||||
region_text = text_in_bbox(
|
|
||||||
bbox_from_str(region_str),
|
|
||||||
all_textlines
|
|
||||||
)
|
|
||||||
textlines.extend(region_text)
|
|
||||||
|
|
||||||
textlines_processed = {}
|
textlines_processed = {}
|
||||||
self.table_bbox = {}
|
self.table_bbox = {}
|
||||||
if debug_info is not None:
|
if self.debug_info is not None:
|
||||||
debug_info_bbox_searches = []
|
debug_info_edges_searches = []
|
||||||
debug_info["bboxes_searches"] = debug_info_bbox_searches
|
self.debug_info["edges_searches"] = debug_info_edges_searches
|
||||||
|
debug_info_bboxes_searches = []
|
||||||
|
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
|
||||||
else:
|
else:
|
||||||
debug_info_bbox_searches = None
|
debug_info_edges_searches = None
|
||||||
|
debug_info_bboxes_searches = None
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
self.textedges = TextEdges2()
|
self.textedges = TextEdges2()
|
||||||
self.textedges.generate(textlines)
|
self.textedges.generate(textlines)
|
||||||
self.textedges._remove_unconnected_edges()
|
self.textedges._remove_unconnected_edges()
|
||||||
|
if debug_info_edges_searches is not None:
|
||||||
|
# Preserve the current edge calculation for display debugging
|
||||||
|
debug_info_edges_searches.append(
|
||||||
|
copy.deepcopy(self.textedges)
|
||||||
|
)
|
||||||
bbox = self.textedges._build_bbox_candidate(
|
bbox = self.textedges._build_bbox_candidate(
|
||||||
debug_info_bbox_searches
|
debug_info_bboxes_searches
|
||||||
)
|
)
|
||||||
if bbox is None:
|
if bbox is None:
|
||||||
break
|
break
|
||||||
|
|
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
|
||||||
average_tl_height
|
average_tl_height
|
||||||
)
|
)
|
||||||
|
|
||||||
if debug_info is not None:
|
if self.debug_info is not None:
|
||||||
debug_info["col_searches"].append({
|
if "col_searches" not in self.debug_info:
|
||||||
|
self.debug_info["col_searches"] = []
|
||||||
|
self.debug_info["col_searches"].append({
|
||||||
"core_bbox": bbox,
|
"core_bbox": bbox,
|
||||||
"cols_anchors": cols_anchors,
|
"cols_anchors": cols_anchors,
|
||||||
"expanded_bbox": expanded_bbox
|
"expanded_bbox": expanded_bbox
|
||||||
|
|
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self, debug_info=None):
|
def extract_tables(self):
|
||||||
if self._document_has_no_text():
|
if self._document_has_no_text():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
# Identify plausible areas within the doc where tables lie,
|
||||||
# populate table_bbox keys with these areas.
|
# populate table_bbox keys with these areas.
|
||||||
self._generate_table_bbox(debug_info)
|
self._generate_table_bbox()
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
|
|
|
||||||
|
|
@ -164,6 +164,20 @@ class PlotMethods(object):
|
||||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def draw_pdf(table, ax):
|
||||||
|
"""Draw the content of the table's source pdf into the passed subplot
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
|
ax : matplotlib.axes.Axes
|
||||||
|
|
||||||
|
"""
|
||||||
|
img = table.get_pdf_image()
|
||||||
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def textedge(table):
|
def textedge(table):
|
||||||
"""Generates a plot for relevant textedges.
|
"""Generates a plot for relevant textedges.
|
||||||
|
|
@ -179,6 +193,7 @@ class PlotMethods(object):
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
PlotMethods.draw_pdf(table, ax)
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
|
|
@ -193,11 +208,13 @@ class PlotMethods(object):
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
||||||
for te in table._textedges:
|
if table.flavor == "hybrid":
|
||||||
ax.plot([te.x, te.x], [te.y0, te.y1])
|
# FRHTODO: Clean this up
|
||||||
|
table.debug_info["edges_searches"][0].plot_alignments(ax)
|
||||||
|
else:
|
||||||
|
for te in table._textedges:
|
||||||
|
ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||||
|
|
||||||
img = table.get_pdf_image()
|
|
||||||
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 16 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 49 KiB After Width: | Height: | Size: 48 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 197 KiB |
|
|
@ -129,5 +129,5 @@ def test_stream_textedge_plot():
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_hybrid_textedge_plot():
|
def test_hybrid_textedge_plot():
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor='hybrid')
|
tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
|
||||||
return camelot.plot(tables[0], kind='textedge')
|
return camelot.plot(tables[0], kind='textedge')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue