diff --git a/camelot/handlers.py b/camelot/handlers.py index 5896497..bc4091e 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -40,10 +40,13 @@ class PDFHandler(): Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. + debug : bool, optional (default: False) + Whether the parser should store debug information during parsing. """ - def __init__(self, filepath, pages="1", password=None): + def __init__(self, filepath, pages="1", password=None, debug=False): + self.debug = debug if is_url(filepath): filepath = download_url(filepath) self.filepath = filepath @@ -193,7 +196,7 @@ class PDFHandler(): tables = [] parser_obj = PARSERS[flavor] - parser = parser_obj(**kwargs) + parser = parser_obj(debug=self.debug, **kwargs) # Read the layouts/dimensions of each of the pages we need to # parse. This might require creating a temporary .pdf. @@ -204,8 +207,8 @@ class PDFHandler(): ) parser.prepare_page_parse(source_file, layout, dimensions, page_idx, layout_kwargs) - rootname = os.path.basename(parser.rootname) if not suppress_stdout: + rootname = os.path.basename(parser.rootname) logger.info( "Processing {rootname}".format(rootname=rootname)) t = parser.extract_tables() diff --git a/camelot/io.py b/camelot/io.py index 6521663..58ec530 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -13,6 +13,7 @@ def read_pdf( flavor="lattice", suppress_stdout=False, layout_kwargs=None, + debug=False, **kwargs ): """Read PDF and return extracted tables. 
@@ -110,7 +111,7 @@ def read_pdf( warnings.simplefilter("ignore") validate_input(kwargs, flavor=flavor) - p = PDFHandler(filepath, pages=pages, password=password) + p = PDFHandler(filepath, pages=pages, password=password, debug=debug) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse( flavor=flavor, diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 7aa35ad..9e76c7b 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -5,7 +5,9 @@ import warnings from ..utils import ( get_text_objects, - get_table_index + get_table_index, + text_in_bbox, + bbox_from_str, ) from ..core import Table @@ -65,7 +67,39 @@ class BaseParser(object): self.debug_info["table_regions"] = self.table_regions self.debug_info["table_areas"] = self.table_areas + def _apply_regions_filter(self, textlines): + """If regions have been specified, filter textlines to these regions. + + Parameters + ---------- + textlines : list + list of textlines to be filtered + + Returns + ------- + filtered_textlines : list of textlines within the regions specified + + """ + filtered_textlines = [] + if self.table_regions is None: + filtered_textlines.extend(textlines) + else: + for region_str in self.table_regions: + region_text = text_in_bbox( + bbox_from_str(region_str), + textlines + ) + filtered_textlines.extend(region_text) + return filtered_textlines + def _document_has_no_text(self): + """Detects image only documents and warns. + + Returns + ------- + has_no_text : bool + Whether the document doesn't have any text at all. 
+ """ if not self.horizontal_text: rootname = os.path.basename(self.rootname) if self.images: @@ -81,23 +115,23 @@ class BaseParser(object): return True return False - """Initialize new table object, ready to be populated - - Parameters - ---------- - table_idx : int - Index of this table within the pdf page analyzed - cols : list - list of coordinate boundaries tuples (left, right) - rows : list - list of coordinate boundaries tuples (bottom, top) - - Returns - ------- - table : camelot.core.Table - - """ def _initialize_new_table(self, table_idx, cols, rows): + """Initialize new table object, ready to be populated + + Parameters + ---------- + table_idx : int + Index of this table within the pdf page analyzed + cols : list + list of coordinate boundaries tuples (left, right) + rows : list + list of coordinate boundaries tuples (bottom, top) + + Returns + ------- + table : camelot.core.Table + + """ table = Table(cols, rows) table.page = self.page table.order = table_idx + 1 diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 3964624..9877867 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -2,7 +2,7 @@ from __future__ import division import numpy as np - +import copy import warnings from .base import BaseParser @@ -459,7 +459,6 @@ class TextEdges2(object): or horizontally. There needs to be connections across both dimensions. 
""" - singleton_textlines = [] removed_singletons = True while removed_singletons: removed_singletons = False @@ -471,7 +470,6 @@ class TextEdges2(object): tl = te.textlines[i] alignments = self._textlines_alignments[tl] if alignments.max_h() <= 1 or alignments.max_v() <= 1: - singleton_textlines.append(tl) del te.textlines[i] removed_singletons = True self._textlines_alignments = {} @@ -612,33 +610,27 @@ class TextEdges2(object): self._register_all_text_lines(textlines) self._compute_alignment_counts() - def plotFRHAlignments(self, table, plt): + def plot_alignments(self, ax): """Displays a visualization of the alignments as currently computed. """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - img = table.get_pdf_image() - ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) - - tls_by_alignment_score = sorted( + # FRHTODO: This is too busy and doesn't plot lines + most_aligned_tl = sorted( self._textlines_alignments.keys(), key=lambda textline: self._textlines_alignments[textline].alignment_score(), reverse=True - ) + )[0] - for tl, alignments in self._textlines_alignments.items(): - color = "red" - if tl == tls_by_alignment_score[0]: - color = "blue" - ax.add_patch( - patches.Rectangle( - (tl.x0, tl.y0), - tl.x1 - tl.x0, tl.y1 - tl.y0, - color=color, - alpha=0.5 - ) + ax.add_patch( + patches.Rectangle( + (most_aligned_tl.x0, most_aligned_tl.y0), + most_aligned_tl.x1 - most_aligned_tl.x0, + most_aligned_tl.y1 - most_aligned_tl.y0, + color="red", + alpha=0.5 ) + ) + for tl, alignments in self._textlines_alignments.items(): ax.text( tl.x0 - 5, tl.y0 - 5, @@ -749,6 +741,7 @@ class Hybrid(BaseParser): edge_tol=50, row_tol=2, column_tol=0, + debug=False, **kwargs ): super().__init__( @@ -758,6 +751,7 @@ class Hybrid(BaseParser): split_text=split_text, strip_text=strip_text, flag_size=flag_size, + debug=debug ) self.columns = columns self._validate_columns() @@ -971,8 +965,7 @@ class Hybrid(BaseParser): raise ValueError("Length of 
table_areas and columns" " should be equal") - # FRHTODO: get debug_info to work again - def _generate_table_bbox(self, debug_info=None): + def _generate_table_bbox(self): if self.table_areas is not None: table_bbox = {} for area_str in self.table_areas: @@ -981,32 +974,30 @@ class Hybrid(BaseParser): return all_textlines = self.horizontal_text + self.vertical_text - textlines = [] - if self.table_regions is None: - textlines = all_textlines - else: - # filter text - for region_str in self.table_regions: - region_text = text_in_bbox( - bbox_from_str(region_str), - all_textlines - ) - textlines.extend(region_text) + textlines = self._apply_regions_filter(all_textlines) textlines_processed = {} self.table_bbox = {} - if debug_info is not None: - debug_info_bbox_searches = [] - debug_info["bboxes_searches"] = debug_info_bbox_searches + if self.debug_info is not None: + debug_info_edges_searches = [] + self.debug_info["edges_searches"] = debug_info_edges_searches + debug_info_bboxes_searches = [] + self.debug_info["bboxes_searches"] = debug_info_bboxes_searches else: - debug_info_bbox_searches = None + debug_info_edges_searches = None + debug_info_bboxes_searches = None while True: self.textedges = TextEdges2() self.textedges.generate(textlines) self.textedges._remove_unconnected_edges() + if debug_info_edges_searches is not None: + # Preserve the current edge calculation for display debugging + debug_info_edges_searches.append( + copy.deepcopy(self.textedges) + ) bbox = self.textedges._build_bbox_candidate( - debug_info_bbox_searches + debug_info_bboxes_searches ) if bbox is None: break @@ -1040,8 +1031,10 @@ class Hybrid(BaseParser): average_tl_height ) - if debug_info is not None: - debug_info["col_searches"].append({ + if self.debug_info is not None: + if "col_searches" not in self.debug_info: + self.debug_info["col_searches"] = [] + self.debug_info["col_searches"].append({ "core_bbox": bbox, "cols_anchors": cols_anchors, "expanded_bbox": expanded_bbox @@ -1148,13 
+1141,13 @@ class Hybrid(BaseParser): return table - def extract_tables(self, debug_info=None): + def extract_tables(self): if self._document_has_no_text(): return [] # Identify plausible areas within the doc where tables lie, # populate table_bbox keys with these areas. - self._generate_table_bbox(debug_info) + self._generate_table_bbox() _tables = [] # sort tables based on y-coord diff --git a/camelot/plotting.py b/camelot/plotting.py index b9d0091..3d6303a 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -164,6 +164,20 @@ class PlotMethods(object): ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig + @staticmethod + def draw_pdf(table, ax): + """Draw the content of the table's source pdf into the passed subplot + + Parameters + ---------- + table : camelot.core.Table + + ax : matplotlib.axes.Axes + + """ + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) + @staticmethod def textedge(table): """Generates a plot for relevant textedges. 
@@ -179,6 +193,7 @@ class PlotMethods(object): """ fig = plt.figure() ax = fig.add_subplot(111, aspect="equal") + PlotMethods.draw_pdf(table, ax) xs, ys = [], [] for t in table._text: xs.extend([t[0], t[2]]) @@ -193,11 +208,13 @@ class PlotMethods(object): ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) - for te in table._textedges: - ax.plot([te.x, te.x], [te.y0, te.y1]) + if table.flavor == "hybrid": + # FRHTODO: Clean this up + table.debug_info["edges_searches"][0].plot_alignments(ax) + else: + for te in table._textedges: + ax.plot([te.x, te.x], [te.y0, te.y1]) - img = table.get_pdf_image() - ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) return fig @staticmethod diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png index d781439..2757c33 100644 Binary files a/tests/files/baseline_plots/test_hybrid_contour_plot.png and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_grid_plot.png b/tests/files/baseline_plots/test_hybrid_grid_plot.png index b04a2f1..9e68660 100644 Binary files a/tests/files/baseline_plots/test_hybrid_grid_plot.png and b/tests/files/baseline_plots/test_hybrid_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png index 1c04473..47e3c52 100644 Binary files a/tests/files/baseline_plots/test_hybrid_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 63f29f3..c550edb 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -129,5 +129,5 @@ def test_stream_textedge_plot(): baseline_dir="files/baseline_plots", remove_text=True) def test_hybrid_textedge_plot(): filename = os.path.join(testdir, "tabula/12s0324.pdf") - tables = camelot.read_pdf(filename, 
flavor='hybrid') + tables = camelot.read_pdf(filename, debug=True, flavor='hybrid') return camelot.plot(tables[0], kind='textedge')