Fix warnings and exceptions

2017-04-21 14:20:33 +05:30 · 2017-04-21 14:20:33 +05:30 · 5c5bd6199c
parent 18e1a799a1
commit 5c5bd6199c
5 changed files with 30 additions and 26 deletions
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -5,6 +5,7 @@ import copy
 import types
 import logging
 import copy_reg
+import warnings
 import subprocess

 from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,


 __all__ = ['Lattice']
-
-logger = logging.getLogger("app_logger")
+logger = logging.getLogger('app_logger')


 def _reduce_method(m):
@ -231,9 +231,9 @@ class Lattice:
        ltchar = get_text_objects(layout, ltype="char")
        width, height = dim
        bname, __ = os.path.splitext(pdfname)
-        logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
        if not ltchar:
-            logger.warning("{0}: PDF has no text. It may be an image.".format(
+            warnings.warn("{0}: Page contains no text.".format(
                os.path.basename(bname)))
            return {os.path.basename(bname): None}

@ -269,7 +269,8 @@ class Lattice:
        if self.table_area is not None:
            if self.fill is not None:
                if len(self.table_area) != len(self.fill):
-                    raise ValueError("Length of table area and fill should be equal.")
+                    raise ValueError("{0}: Length of table area and fill should"
+                                     " be equal.".format(os.path.basename(bname)))

            areas = []
            for area in self.table_area:
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@ -1,5 +1,6 @@
 import os
 import copy
+import logging
 import subprocess

 import pyocr
@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
 from .utils import merge_close_values, encode_list


+__all__ = ['OCRLattice', 'OCRStream']
+logger = logging.getLogger('app_logger')
+
+
 class OCRLattice:
    """Lattice, but for images.

@ -81,6 +86,7 @@ class OCRLattice:

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@ -230,6 +236,7 @@ class OCRStream:

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@ -252,7 +259,8 @@ class OCRStream:
        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
-                    raise ValueError("Length of table area and columns should be equal.")
+                    raise ValueError("{0}: Length of table area and columns"
+                                     " should be equal.".format(os.path.basename(bname)))

            table_bbox = {}
            for area in self.table_area:
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@ -1,6 +1,5 @@
 import os
 import shutil
-import logging
 import tempfile
 import itertools
 import multiprocessing as mp
@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation

 __all__ = ['Pdf']

-logger = logging.getLogger("app_logger")
-

 def _parse_page_numbers(pagenos):
    """Converts list of dicts to list of ints.
@ -104,7 +101,7 @@ class Pdf:
        self.extractor = extractor
        self.pdfname = pdfname
        if not self.pdfname.endswith('.pdf'):
-            raise TypeError("Only PDF format is supported right now.")
+            raise TypeError("File format not supported.")
        self.pagenos = _parse_page_numbers(pagenos)
        self.parallel = parallel
        if self.parallel:
@ -116,7 +113,6 @@ class Pdf:
    def split(self):
        """Splits file into single page pdfs.
        """
-        logger.info('Splitting pages...')
        if self.parallel:
            pfunc = partial(_save_page, self.temp, self.pdfname)
            self.pool.map(pfunc, self.pagenos)
@ -211,7 +207,7 @@ class Pdf:
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'joint':
            try:
                for img, table_bbox in self.debug_images:
@ -227,7 +223,7 @@ class Pdf:
                    plt.imshow(img)
                    plt.show()
            except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'line':
            try:
                for v_s, h_s in self.debug_segments:
@ -237,7 +233,7 @@ class Pdf:
                        plt.plot([h[0], h[2]], [h[1], h[3]])
                    plt.show()
            except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
        elif self.debug == 'table':
            try:
                for tables in self.debug_tables:
@ -266,7 +262,7 @@ class Pdf:
                                              table.cells[r][c].rb[1]])
                    plt.show()
            except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
        else:
            raise UserWarning("This method can only be called after"
                " debug has been specified.")
--- a/camelot/stream.py
+++ b/camelot/stream.py
@ -4,6 +4,7 @@ import copy
 import types
 import logging
 import copy_reg
+import warnings

 import numpy as np

@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,


 __all__ = ['Stream']
-
-logger = logging.getLogger("app_logger")
+logger = logging.getLogger('app_logger')


 def _reduce_method(m):
@ -297,9 +297,9 @@ class Stream:
        ltchar = get_text_objects(layout, ltype="char")
        width, height = dim
        bname, __ = os.path.splitext(pdfname)
-        logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
        if not lttextlh:
-            logger.warning("{0}: PDF has no text. It may be an image.".format(
+            warnings.warn("{0}: Page contains no text.".format(
                os.path.basename(bname)))
            return {os.path.basename(bname): None}

@ -312,7 +312,8 @@ class Stream:
        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
-                    raise ValueError("Length of table area and columns should be equal.")
+                    raise ValueError("{0}: Length of table area and columns"
+                                     " should be equal.".format(os.path.basename(bname)))

            table_bbox = {}
            for area in self.table_area:
@ -370,9 +371,8 @@ class Stream:
                len_non_mode = len(filter(lambda x: x != ncols, elements))
                if ncols == 1:
                    # no tables detected
-                    logger.warning("{}: Only one column was detected, the pdf"
-                                   " may have no tables.".format(
-                                  os.path.basename(bname)))
+                    warnings.warn("{0}: Page contains no tables.".format(
+                        os.path.basename(bname)))
                cols = [(t.x0, t.x1)
                    for r in rows_grouped if len(r) == ncols for t in r]
                cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -528,7 +528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
                else:
                    lt_col_overlap.append(-1)
            if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
-                logging.warning("Text doesn't fit any column.")
+                logging.warning("Text did not fit any column.")
            r_idx = r
            c_idx = lt_col_overlap.index(max(lt_col_overlap))
            break
@ -576,8 +576,7 @@ def get_score(error_weights):
    try:
        score = 0
        if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
-            raise ValueError("Please assign a valid weightage to each parameter"
-                             " such that their sum is equal to 100")
+            raise ValueError("Sum of weights should be equal to 100.")
        for ew in error_weights:
            weight = ew[0] / len(ew[1])
            for error_percentage in ew[1]: