From 5c5bd6199c0a58103fc80551ba055795b0211f4e Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 21 Apr 2017 14:20:33 +0530 Subject: [PATCH] Fix warnings and exceptions --- camelot/lattice.py | 11 ++++++----- camelot/ocr.py | 10 +++++++++- camelot/pdf.py | 14 +++++--------- camelot/stream.py | 16 ++++++++-------- camelot/utils.py | 5 ++--- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/camelot/lattice.py b/camelot/lattice.py index 17b1170..7652ad0 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -5,6 +5,7 @@ import copy import types import logging import copy_reg +import warnings import subprocess from .imgproc import (adaptive_threshold, find_lines, find_table_contours, @@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, __all__ = ['Lattice'] - -logger = logging.getLogger("app_logger") +logger = logging.getLogger('app_logger') def _reduce_method(m): @@ -231,9 +231,9 @@ class Lattice: ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) - logger.info('Parsing tables from {0}.'.format(os.path.basename(bname))) + logger.info('Processing {0}.'.format(os.path.basename(bname))) if not ltchar: - logger.warning("{0}: PDF has no text. 
It may be an image.".format( + warnings.warn("{0}: Page contains no text.".format( os.path.basename(bname))) return {os.path.basename(bname): None} @@ -269,7 +269,8 @@ class Lattice: if self.table_area is not None: if self.fill is not None: if len(self.table_area) != len(self.fill): - raise ValueError("Length of table area and fill should be equal.") + raise ValueError("{0}: Length of table area and fill should" + " be equal.".format(os.path.basename(bname))) areas = [] for area in self.table_area: diff --git a/camelot/ocr.py b/camelot/ocr.py index 4be2bb5..6be7c26 100644 --- a/camelot/ocr.py +++ b/camelot/ocr.py @@ -1,5 +1,6 @@ import os import copy +import logging import subprocess import pyocr @@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .utils import merge_close_values, encode_list +__all__ = ['OCRLattice', 'OCRStream'] +logger = logging.getLogger('app_logger') + + class OCRLattice: """Lattice, but for images. @@ -81,6 +86,7 @@ class OCRLattice: bname, __ = os.path.splitext(pdfname) imagename = ''.join([bname, '.png']) + logger.info('Processing {0}.'.format(os.path.basename(bname))) gs_call = [ "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), @@ -230,6 +236,7 @@ class OCRStream: bname, __ = os.path.splitext(pdfname) imagename = ''.join([bname, '.png']) + logger.info('Processing {0}.'.format(os.path.basename(bname))) gs_call = [ "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), @@ -252,7 +259,8 @@ class OCRStream: if self.table_area is not None: if self.columns is not None: if len(self.table_area) != len(self.columns): - raise ValueError("Length of table area and columns should be equal.") + raise ValueError("{0}: Length of table area and columns" + " should be equal.".format(os.path.basename(bname))) table_bbox = {} for area in self.table_area: diff --git a/camelot/pdf.py b/camelot/pdf.py index 7c6ae0f..08fd26c 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -1,6 +1,5 
@@ import os import shutil -import logging import tempfile import itertools import multiprocessing as mp @@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation __all__ = ['Pdf'] -logger = logging.getLogger("app_logger") - def _parse_page_numbers(pagenos): """Converts list of dicts to list of ints. @@ -104,7 +101,7 @@ class Pdf: self.extractor = extractor self.pdfname = pdfname if not self.pdfname.endswith('.pdf'): - raise TypeError("Only PDF format is supported right now.") + raise TypeError("File format not supported.") self.pagenos = _parse_page_numbers(pagenos) self.parallel = parallel if self.parallel: @@ -116,7 +113,6 @@ class Pdf: def split(self): """Splits file into single page pdfs. """ - logger.info('Splitting pages...') if self.parallel: pfunc = partial(_save_page, self.temp, self.pdfname) self.pool.map(pfunc, self.pagenos) @@ -211,7 +207,7 @@ class Pdf: plt.imshow(img) plt.show() except AttributeError: - raise ValueError("This option only be used with Lattice.") + raise ValueError("This option can only be used with Lattice.") elif self.debug == 'joint': try: for img, table_bbox in self.debug_images: @@ -227,7 +223,7 @@ class Pdf: plt.imshow(img) plt.show() except AttributeError: - raise ValueError("This option only be used with Lattice.") + raise ValueError("This option can only be used with Lattice.") elif self.debug == 'line': try: for v_s, h_s in self.debug_segments: @@ -237,7 +233,7 @@ class Pdf: plt.plot([h[0], h[2]], [h[1], h[3]]) plt.show() except AttributeError: - raise ValueError("This option only be used with Lattice.") + raise ValueError("This option can only be used with Lattice.") elif self.debug == 'table': try: for tables in self.debug_tables: @@ -266,7 +262,7 @@ class Pdf: table.cells[r][c].rb[1]]) plt.show() except AttributeError: - raise ValueError("This option only be used with Lattice.") + raise ValueError("This option can only be used with Lattice.") else: raise UserWarning("This method can only be 
called after" " debug has been specified.") \ No newline at end of file diff --git a/camelot/stream.py b/camelot/stream.py index 96dfedd..2b6948a 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -4,6 +4,7 @@ import copy import types import logging import copy_reg +import warnings import numpy as np @@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty, __all__ = ['Stream'] - -logger = logging.getLogger("app_logger") +logger = logging.getLogger('app_logger') def _reduce_method(m): @@ -297,9 +297,9 @@ class Stream: ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) - logger.info('Parsing tables from {0}.'.format(os.path.basename(bname))) + logger.info('Processing {0}.'.format(os.path.basename(bname))) if not lttextlh: - logger.warning("{0}: PDF has no text. It may be an image.".format( + warnings.warn("{0}: Page contains no text.".format( os.path.basename(bname))) return {os.path.basename(bname): None} @@ -312,7 +312,8 @@ class Stream: if self.table_area is not None: if self.columns is not None: if len(self.table_area) != len(self.columns): - raise ValueError("Length of table area and columns should be equal.") + raise ValueError("{0}: Length of table area and columns" + " should be equal.".format(os.path.basename(bname))) table_bbox = {} for area in self.table_area: @@ -370,9 +371,8 @@ class Stream: len_non_mode = len(filter(lambda x: x != ncols, elements)) if ncols == 1: # no tables detected - logger.warning("{}: Only one column was detected, the pdf" - " may have no tables.".format( - os.path.basename(bname))) + warnings.warn("{0}: Page contains no tables.".format( + os.path.basename(bname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no]) diff --git a/camelot/utils.py b/camelot/utils.py index c5e958c..e209070 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -528,7 
+528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): else: lt_col_overlap.append(-1) if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: - logging.warning("Text doesn't fit any column.") + logging.warning("Text did not fit any column.") r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) break @@ -576,8 +576,7 @@ def get_score(error_weights): try: score = 0 if sum([ew[0] for ew in error_weights]) != SCORE_VAL: - raise ValueError("Please assign a valid weightage to each parameter" - " such that their sum is equal to 100") + raise ValueError("Sum of weights should be equal to 100.") for ew in error_weights: weight = ew[0] / len(ew[1]) for error_percentage in ew[1]: