From 5c5bd6199c0a58103fc80551ba055795b0211f4e Mon Sep 17 00:00:00 2001
From: Vinayak Mehta <vmehta94@gmail.com>
Date: Fri, 21 Apr 2017 14:20:33 +0530
Subject: [PATCH] Fix warnings and exceptions

---
 camelot/lattice.py | 11 ++++++-----
 camelot/ocr.py     | 10 +++++++++-
 camelot/pdf.py     | 14 +++++---------
 camelot/stream.py  | 16 ++++++++--------
 camelot/utils.py   |  5 ++---
 5 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/camelot/lattice.py b/camelot/lattice.py
index 17b1170..7652ad0 100644
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@@ -5,6 +5,7 @@ import copy
 import types
 import logging
 import copy_reg
+import warnings
 import subprocess
 
 from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
@@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
 
 
 __all__ = ['Lattice']
-
-logger = logging.getLogger("app_logger")
+logger = logging.getLogger('app_logger')
 
 
 def _reduce_method(m):
@@ -231,9 +231,9 @@ class Lattice:
         ltchar = get_text_objects(layout, ltype="char")
         width, height = dim
         bname, __ = os.path.splitext(pdfname)
-        logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
         if not ltchar:
-            logger.warning("{0}: PDF has no text. It may be an image.".format(
+            warnings.warn("{0}: Page contains no text.".format(
                 os.path.basename(bname)))
             return {os.path.basename(bname): None}
 
@@ -269,7 +269,8 @@ class Lattice:
         if self.table_area is not None:
             if self.fill is not None:
                 if len(self.table_area) != len(self.fill):
-                    raise ValueError("Length of table area and fill should be equal.")
+                    raise ValueError("{0}: Length of table area and fill should"
+                                     " be equal.".format(os.path.basename(bname)))
 
             areas = []
             for area in self.table_area:
diff --git a/camelot/ocr.py b/camelot/ocr.py
index 4be2bb5..6be7c26 100644
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@@ -1,5 +1,6 @@
 import os
 import copy
+import logging
 import subprocess
 
 import pyocr
@@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
 from .utils import merge_close_values, encode_list
 
 
+__all__ = ['OCRLattice', 'OCRStream']
+logger = logging.getLogger('app_logger')
+
+
 class OCRLattice:
     """Lattice, but for images.
 
@@ -81,6 +86,7 @@ class OCRLattice:
 
         bname, __ = os.path.splitext(pdfname)
         imagename = ''.join([bname, '.png'])
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
 
         gs_call = [
             "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@@ -230,6 +236,7 @@ class OCRStream:
 
         bname, __ = os.path.splitext(pdfname)
         imagename = ''.join([bname, '.png'])
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
 
         gs_call = [
             "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@@ -252,7 +259,8 @@ class OCRStream:
         if self.table_area is not None:
             if self.columns is not None:
                 if len(self.table_area) != len(self.columns):
-                    raise ValueError("Length of table area and columns should be equal.")
+                    raise ValueError("{0}: Length of table area and columns"
+                                     " should be equal.".format(os.path.basename(bname)))
 
             table_bbox = {}
             for area in self.table_area:
diff --git a/camelot/pdf.py b/camelot/pdf.py
index 7c6ae0f..08fd26c 100644
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@@ -1,6 +1,5 @@
 import os
 import shutil
-import logging
 import tempfile
 import itertools
 import multiprocessing as mp
@@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation
 
 __all__ = ['Pdf']
 
-logger = logging.getLogger("app_logger")
-
 
 def _parse_page_numbers(pagenos):
     """Converts list of dicts to list of ints.
@@ -104,7 +101,7 @@ class Pdf:
         self.extractor = extractor
         self.pdfname = pdfname
         if not self.pdfname.endswith('.pdf'):
-            raise TypeError("Only PDF format is supported right now.")
+            raise TypeError("File format not supported.")
         self.pagenos = _parse_page_numbers(pagenos)
         self.parallel = parallel
         if self.parallel:
@@ -116,7 +113,6 @@ class Pdf:
     def split(self):
         """Splits file into single page pdfs.
         """
-        logger.info('Splitting pages...')
         if self.parallel:
             pfunc = partial(_save_page, self.temp, self.pdfname)
             self.pool.map(pfunc, self.pagenos)
@@ -211,7 +207,7 @@ class Pdf:
                     plt.imshow(img)
                     plt.show()
             except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
         elif self.debug == 'joint':
             try:
                 for img, table_bbox in self.debug_images:
@@ -227,7 +223,7 @@ class Pdf:
                     plt.imshow(img)
                     plt.show()
             except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
         elif self.debug == 'line':
             try:
                 for v_s, h_s in self.debug_segments:
@@ -237,7 +233,7 @@ class Pdf:
                         plt.plot([h[0], h[2]], [h[1], h[3]])
                     plt.show()
             except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
         elif self.debug == 'table':
             try:
                 for tables in self.debug_tables:
@@ -266,7 +262,7 @@ class Pdf:
                                               table.cells[r][c].rb[1]])
                     plt.show()
             except AttributeError:
-                raise ValueError("This option only be used with Lattice.")
+                raise ValueError("This option can only be used with Lattice.")
         else:
             raise UserWarning("This method can only be called after"
                 " debug has been specified.")
\ No newline at end of file
diff --git a/camelot/stream.py b/camelot/stream.py
index 96dfedd..2b6948a 100644
--- a/camelot/stream.py
+++ b/camelot/stream.py
@@ -4,6 +4,7 @@ import copy
 import types
 import logging
 import copy_reg
+import warnings
 
 import numpy as np
 
@@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
 
 
 __all__ = ['Stream']
-
-logger = logging.getLogger("app_logger")
+logger = logging.getLogger('app_logger')
 
 
 def _reduce_method(m):
@@ -297,9 +297,9 @@ class Stream:
         ltchar = get_text_objects(layout, ltype="char")
         width, height = dim
         bname, __ = os.path.splitext(pdfname)
-        logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
+        logger.info('Processing {0}.'.format(os.path.basename(bname)))
         if not lttextlh:
-            logger.warning("{0}: PDF has no text. It may be an image.".format(
+            warnings.warn("{0}: Page contains no text.".format(
                 os.path.basename(bname)))
             return {os.path.basename(bname): None}
 
@@ -312,7 +312,8 @@ class Stream:
         if self.table_area is not None:
             if self.columns is not None:
                 if len(self.table_area) != len(self.columns):
-                    raise ValueError("Length of table area and columns should be equal.")
+                    raise ValueError("{0}: Length of table area and columns"
+                                     " should be equal.".format(os.path.basename(bname)))
 
             table_bbox = {}
             for area in self.table_area:
@@ -370,9 +371,8 @@ class Stream:
                 len_non_mode = len(filter(lambda x: x != ncols, elements))
                 if ncols == 1:
                     # no tables detected
-                    logger.warning("{}: Only one column was detected, the pdf"
-                                   " may have no tables.".format(
-                                  os.path.basename(bname)))
+                    warnings.warn("{0}: Page contains no tables.".format(
+                        os.path.basename(bname)))
                 cols = [(t.x0, t.x1)
                     for r in rows_grouped if len(r) == ncols for t in r]
                 cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
diff --git a/camelot/utils.py b/camelot/utils.py
index c5e958c..e209070 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -528,7 +528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
                 else:
                     lt_col_overlap.append(-1)
             if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
-                logging.warning("Text doesn't fit any column.")
+                logging.warning("Text did not fit any column.")
             r_idx = r
             c_idx = lt_col_overlap.index(max(lt_col_overlap))
             break
@@ -576,8 +576,7 @@ def get_score(error_weights):
     try:
         score = 0
         if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
-            raise ValueError("Please assign a valid weightage to each parameter"
-                             " such that their sum is equal to 100")
+            raise ValueError("Sum of weights should be equal to 100.")
         for ew in error_weights:
             weight = ew[0] / len(ew[1])
             for error_percentage in ew[1]: