Fix warnings and exceptions

pull/2/head
Vinayak Mehta 2017-04-21 14:20:33 +05:30
parent 18e1a799a1
commit 5c5bd6199c
5 changed files with 30 additions and 26 deletions

View File

@ -5,6 +5,7 @@ import copy
import types import types
import logging import logging
import copy_reg import copy_reg
import warnings
import subprocess import subprocess
from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
__all__ = ['Lattice'] __all__ = ['Lattice']
logger = logging.getLogger('app_logger')
logger = logging.getLogger("app_logger")
def _reduce_method(m): def _reduce_method(m):
@ -231,9 +231,9 @@ class Lattice:
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
width, height = dim width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname))) logger.info('Processing {0}.'.format(os.path.basename(bname)))
if not ltchar: if not ltchar:
logger.warning("{0}: PDF has no text. It may be an image.".format( warnings.warn("{0}: Page contains no text.".format(
os.path.basename(bname))) os.path.basename(bname)))
return {os.path.basename(bname): None} return {os.path.basename(bname): None}
@ -269,7 +269,8 @@ class Lattice:
if self.table_area is not None: if self.table_area is not None:
if self.fill is not None: if self.fill is not None:
if len(self.table_area) != len(self.fill): if len(self.table_area) != len(self.fill):
raise ValueError("Length of table area and fill should be equal.") raise ValueError("{0}: Length of table area and fill should"
" be equal.".format(os.path.basename(bname)))
areas = [] areas = []
for area in self.table_area: for area in self.table_area:

View File

@ -1,5 +1,6 @@
import os import os
import copy import copy
import logging
import subprocess import subprocess
import pyocr import pyocr
@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
from .utils import merge_close_values, encode_list from .utils import merge_close_values, encode_list
__all__ = ['OCRLattice', 'OCRStream']
logger = logging.getLogger('app_logger')
class OCRLattice: class OCRLattice:
"""Lattice, but for images. """Lattice, but for images.
@ -81,6 +86,7 @@ class OCRLattice:
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
imagename = ''.join([bname, '.png']) imagename = ''.join([bname, '.png'])
logger.info('Processing {0}.'.format(os.path.basename(bname)))
gs_call = [ gs_call = [
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@ -230,6 +236,7 @@ class OCRStream:
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
imagename = ''.join([bname, '.png']) imagename = ''.join([bname, '.png'])
logger.info('Processing {0}.'.format(os.path.basename(bname)))
gs_call = [ gs_call = [
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
@ -252,7 +259,8 @@ class OCRStream:
if self.table_area is not None: if self.table_area is not None:
if self.columns is not None: if self.columns is not None:
if len(self.table_area) != len(self.columns): if len(self.table_area) != len(self.columns):
raise ValueError("Length of table area and columns should be equal.") raise ValueError("{0}: Length of table area and columns"
" should be equal.".format(os.path.basename(bname)))
table_bbox = {} table_bbox = {}
for area in self.table_area: for area in self.table_area:

View File

@ -1,6 +1,5 @@
import os import os
import shutil import shutil
import logging
import tempfile import tempfile
import itertools import itertools
import multiprocessing as mp import multiprocessing as mp
@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation
__all__ = ['Pdf'] __all__ = ['Pdf']
logger = logging.getLogger("app_logger")
def _parse_page_numbers(pagenos): def _parse_page_numbers(pagenos):
"""Converts list of dicts to list of ints. """Converts list of dicts to list of ints.
@ -104,7 +101,7 @@ class Pdf:
self.extractor = extractor self.extractor = extractor
self.pdfname = pdfname self.pdfname = pdfname
if not self.pdfname.endswith('.pdf'): if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported right now.") raise TypeError("File format not supported.")
self.pagenos = _parse_page_numbers(pagenos) self.pagenos = _parse_page_numbers(pagenos)
self.parallel = parallel self.parallel = parallel
if self.parallel: if self.parallel:
@ -116,7 +113,6 @@ class Pdf:
def split(self): def split(self):
"""Splits file into single page pdfs. """Splits file into single page pdfs.
""" """
logger.info('Splitting pages...')
if self.parallel: if self.parallel:
pfunc = partial(_save_page, self.temp, self.pdfname) pfunc = partial(_save_page, self.temp, self.pdfname)
self.pool.map(pfunc, self.pagenos) self.pool.map(pfunc, self.pagenos)
@ -211,7 +207,7 @@ class Pdf:
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
except AttributeError: except AttributeError:
raise ValueError("This option only be used with Lattice.") raise ValueError("This option can only be used with Lattice.")
elif self.debug == 'joint': elif self.debug == 'joint':
try: try:
for img, table_bbox in self.debug_images: for img, table_bbox in self.debug_images:
@ -227,7 +223,7 @@ class Pdf:
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
except AttributeError: except AttributeError:
raise ValueError("This option only be used with Lattice.") raise ValueError("This option can only be used with Lattice.")
elif self.debug == 'line': elif self.debug == 'line':
try: try:
for v_s, h_s in self.debug_segments: for v_s, h_s in self.debug_segments:
@ -237,7 +233,7 @@ class Pdf:
plt.plot([h[0], h[2]], [h[1], h[3]]) plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show() plt.show()
except AttributeError: except AttributeError:
raise ValueError("This option only be used with Lattice.") raise ValueError("This option can only be used with Lattice.")
elif self.debug == 'table': elif self.debug == 'table':
try: try:
for tables in self.debug_tables: for tables in self.debug_tables:
@ -266,7 +262,7 @@ class Pdf:
table.cells[r][c].rb[1]]) table.cells[r][c].rb[1]])
plt.show() plt.show()
except AttributeError: except AttributeError:
raise ValueError("This option only be used with Lattice.") raise ValueError("This option can only be used with Lattice.")
else: else:
raise UserWarning("This method can only be called after" raise UserWarning("This method can only be called after"
" debug has been specified.") " debug has been specified.")

View File

@ -4,6 +4,7 @@ import copy
import types import types
import logging import logging
import copy_reg import copy_reg
import warnings
import numpy as np import numpy as np
@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
__all__ = ['Stream'] __all__ = ['Stream']
logger = logging.getLogger('app_logger')
logger = logging.getLogger("app_logger")
def _reduce_method(m): def _reduce_method(m):
@ -297,9 +297,9 @@ class Stream:
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
width, height = dim width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname))) logger.info('Processing {0}.'.format(os.path.basename(bname)))
if not lttextlh: if not lttextlh:
logger.warning("{0}: PDF has no text. It may be an image.".format( warnings.warn("{0}: Page contains no text.".format(
os.path.basename(bname))) os.path.basename(bname)))
return {os.path.basename(bname): None} return {os.path.basename(bname): None}
@ -312,7 +312,8 @@ class Stream:
if self.table_area is not None: if self.table_area is not None:
if self.columns is not None: if self.columns is not None:
if len(self.table_area) != len(self.columns): if len(self.table_area) != len(self.columns):
raise ValueError("Length of table area and columns should be equal.") raise ValueError("{0}: Length of table area and columns"
" should be equal.".format(os.path.basename(bname)))
table_bbox = {} table_bbox = {}
for area in self.table_area: for area in self.table_area:
@ -370,9 +371,8 @@ class Stream:
len_non_mode = len(filter(lambda x: x != ncols, elements)) len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1: if ncols == 1:
# no tables detected # no tables detected
logger.warning("{}: Only one column was detected, the pdf" warnings.warn("{0}: Page contains no tables.".format(
" may have no tables.".format( os.path.basename(bname)))
os.path.basename(bname)))
cols = [(t.x0, t.x1) cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r] for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no]) cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])

View File

@ -528,7 +528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
else: else:
lt_col_overlap.append(-1) lt_col_overlap.append(-1)
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
logging.warning("Text doesn't fit any column.") logging.warning("Text did not fit any column.")
r_idx = r r_idx = r
c_idx = lt_col_overlap.index(max(lt_col_overlap)) c_idx = lt_col_overlap.index(max(lt_col_overlap))
break break
@ -576,8 +576,7 @@ def get_score(error_weights):
try: try:
score = 0 score = 0
if sum([ew[0] for ew in error_weights]) != SCORE_VAL: if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
raise ValueError("Please assign a valid weightage to each parameter" raise ValueError("Sum of weights should be equal to 100.")
" such that their sum is equal to 100")
for ew in error_weights: for ew in error_weights:
weight = ew[0] / len(ew[1]) weight = ew[0] / len(ew[1])
for error_percentage in ew[1]: for error_percentage in ew[1]: