diff --git a/camelot/handlers.py b/camelot/handlers.py index 64b6197..08685a6 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -25,7 +25,7 @@ PARSERS = { } -class PDFHandler(object): +class PDFHandler(): """Handles all operations like temp directory creation, splitting file into single page PDFs, parsing each PDF and then removing the temp directory. @@ -201,8 +201,8 @@ class PDFHandler(object): page_idx, layout_kwargs=layout_kwargs ) - parser._generate_layout(source_file, layout, dimensions, - page_idx, layout_kwargs) + parser.prepare_page_parse(source_file, layout, dimensions, + page_idx, layout_kwargs) rootname = os.path.basename(parser.rootname) if not suppress_stdout: logger.info( diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index b364f04..921a118 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import warnings from ..utils import ( get_text_objects, @@ -40,8 +41,8 @@ class BaseParser(object): # For plotting details of parsing algorithms self.debug_info = {} - def _generate_layout(self, filename, layout, dimensions, - page_idx, layout_kwargs): + def prepare_page_parse(self, filename, layout, dimensions, + page_idx, layout_kwargs): self.filename = filename self.layout_kwargs = layout_kwargs self.layout = layout @@ -59,6 +60,22 @@ class BaseParser(object): self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) + def _document_has_no_text(self): + if not self.horizontal_text: + rootname = os.path.basename(self.rootname) + if self.images: + warnings.warn( + "{rootname} is image-based, " + "camelot only works on text-based pages." + .format(rootname=rootname) + ) + else: + warnings.warn( + "No tables found on {rootname}".format(rootname=rootname) + ) + return True + return False + """Initialize new table object, ready to be populated Parameters diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index cefc27f..66dd98c 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -3,7 +3,6 @@ from __future__ import division import os import copy -import warnings from .base import BaseParser @@ -312,18 +311,7 @@ class Lattice(BaseParser): return table def extract_tables(self, filename): - rootname = os.path.basename(self.rootname) - if not self.horizontal_text: - if self.images: - warnings.warn( - "{rootname} is image-based, " - "camelot only works on text-based pages." - .format(rootname=rootname) - ) - else: - warnings.warn( - "No tables found on {rootname}".format(rootname=rootname) - ) + if self._document_has_no_text(): return [] self._generate_table_bbox() diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 351d7d3..2badb39 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import division -import os import warnings import numpy as np @@ -427,18 +426,7 @@ class Stream(BaseParser): return table def extract_tables(self, filename): - if not self.horizontal_text: - if self.images: - warnings.warn( - "{} is image-based, camelot only works on" - " text-based pages.".format( - os.path.basename(self.rootname)) - ) - else: - warnings.warn( - "No tables found on {}".format( - os.path.basename(self.rootname)) - ) + if self._document_has_no_text(): return [] # Identify plausible areas within the doc where tables lie, diff --git a/camelot/utils.py b/camelot/utils.py index 2c66e7c..e6f8e50 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1107,11 +1107,15 @@ def compare_tables(left, right): diff_df = diff_df.append(lrow, ignore_index=True) diff_df = diff_df.append(srow, ignore_index=True) diff_df.insert(0, 'Table', [name_table1, name_table2]) - print(f"Row {index} differs:") + print("Row {index} differs:".format(index=index)) print(diff_df.values) break else: - print(f"Row {index} unique to {name_table1}: {lrow}") + print("Row {index} unique to {name_table1}: {lrow}".format( + index=index, + name_table1=name_table1, + lrow=lrow + )) break else: print("Tables have different shapes")