Fix warnings and exceptions
parent
18e1a799a1
commit
5c5bd6199c
|
|
@ -5,6 +5,7 @@ import copy
|
|||
import types
|
||||
import logging
|
||||
import copy_reg
|
||||
import warnings
|
||||
import subprocess
|
||||
|
||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
|
|
@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
|||
|
||||
|
||||
__all__ = ['Lattice']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
|
|
@ -231,9 +231,9 @@ class Lattice:
|
|||
ltchar = get_text_objects(layout, ltype="char")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
if not ltchar:
|
||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
warnings.warn("{0}: Page contains no text.".format(
|
||||
os.path.basename(bname)))
|
||||
return {os.path.basename(bname): None}
|
||||
|
||||
|
|
@ -269,7 +269,8 @@ class Lattice:
|
|||
if self.table_area is not None:
|
||||
if self.fill is not None:
|
||||
if len(self.table_area) != len(self.fill):
|
||||
raise ValueError("Length of table area and fill should be equal.")
|
||||
raise ValueError("{0}: Length of table area and fill should"
|
||||
" be equal.".format(os.path.basename(bname)))
|
||||
|
||||
areas = []
|
||||
for area in self.table_area:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import copy
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import pyocr
|
||||
|
|
@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
|||
from .utils import merge_close_values, encode_list
|
||||
|
||||
|
||||
__all__ = ['OCRLattice', 'OCRStream']
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
class OCRLattice:
|
||||
"""Lattice, but for images.
|
||||
|
||||
|
|
@ -81,6 +86,7 @@ class OCRLattice:
|
|||
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
imagename = ''.join([bname, '.png'])
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
|
||||
gs_call = [
|
||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
||||
|
|
@ -230,6 +236,7 @@ class OCRStream:
|
|||
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
imagename = ''.join([bname, '.png'])
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
|
||||
gs_call = [
|
||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
||||
|
|
@ -252,7 +259,8 @@ class OCRStream:
|
|||
if self.table_area is not None:
|
||||
if self.columns is not None:
|
||||
if len(self.table_area) != len(self.columns):
|
||||
raise ValueError("Length of table area and columns should be equal.")
|
||||
raise ValueError("{0}: Length of table area and columns"
|
||||
" should be equal.".format(os.path.basename(bname)))
|
||||
|
||||
table_bbox = {}
|
||||
for area in self.table_area:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import tempfile
|
||||
import itertools
|
||||
import multiprocessing as mp
|
||||
|
|
@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation
|
|||
|
||||
__all__ = ['Pdf']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
|
||||
|
||||
def _parse_page_numbers(pagenos):
|
||||
"""Converts list of dicts to list of ints.
|
||||
|
|
@ -104,7 +101,7 @@ class Pdf:
|
|||
self.extractor = extractor
|
||||
self.pdfname = pdfname
|
||||
if not self.pdfname.endswith('.pdf'):
|
||||
raise TypeError("Only PDF format is supported right now.")
|
||||
raise TypeError("File format not supported.")
|
||||
self.pagenos = _parse_page_numbers(pagenos)
|
||||
self.parallel = parallel
|
||||
if self.parallel:
|
||||
|
|
@ -116,7 +113,6 @@ class Pdf:
|
|||
def split(self):
|
||||
"""Splits file into single page pdfs.
|
||||
"""
|
||||
logger.info('Splitting pages...')
|
||||
if self.parallel:
|
||||
pfunc = partial(_save_page, self.temp, self.pdfname)
|
||||
self.pool.map(pfunc, self.pagenos)
|
||||
|
|
@ -211,7 +207,7 @@ class Pdf:
|
|||
plt.imshow(img)
|
||||
plt.show()
|
||||
except AttributeError:
|
||||
raise ValueError("This option only be used with Lattice.")
|
||||
raise ValueError("This option can only be used with Lattice.")
|
||||
elif self.debug == 'joint':
|
||||
try:
|
||||
for img, table_bbox in self.debug_images:
|
||||
|
|
@ -227,7 +223,7 @@ class Pdf:
|
|||
plt.imshow(img)
|
||||
plt.show()
|
||||
except AttributeError:
|
||||
raise ValueError("This option only be used with Lattice.")
|
||||
raise ValueError("This option can only be used with Lattice.")
|
||||
elif self.debug == 'line':
|
||||
try:
|
||||
for v_s, h_s in self.debug_segments:
|
||||
|
|
@ -237,7 +233,7 @@ class Pdf:
|
|||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||
plt.show()
|
||||
except AttributeError:
|
||||
raise ValueError("This option only be used with Lattice.")
|
||||
raise ValueError("This option can only be used with Lattice.")
|
||||
elif self.debug == 'table':
|
||||
try:
|
||||
for tables in self.debug_tables:
|
||||
|
|
@ -266,7 +262,7 @@ class Pdf:
|
|||
table.cells[r][c].rb[1]])
|
||||
plt.show()
|
||||
except AttributeError:
|
||||
raise ValueError("This option only be used with Lattice.")
|
||||
raise ValueError("This option can only be used with Lattice.")
|
||||
else:
|
||||
raise UserWarning("This method can only be called after"
|
||||
" debug has been specified.")
|
||||
|
|
@ -4,6 +4,7 @@ import copy
|
|||
import types
|
||||
import logging
|
||||
import copy_reg
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
|
@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
|||
|
||||
|
||||
__all__ = ['Stream']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
logger = logging.getLogger('app_logger')
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
|
|
@ -297,9 +297,9 @@ class Stream:
|
|||
ltchar = get_text_objects(layout, ltype="char")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
|
||||
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||
if not lttextlh:
|
||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
warnings.warn("{0}: Page contains no text.".format(
|
||||
os.path.basename(bname)))
|
||||
return {os.path.basename(bname): None}
|
||||
|
||||
|
|
@ -312,7 +312,8 @@ class Stream:
|
|||
if self.table_area is not None:
|
||||
if self.columns is not None:
|
||||
if len(self.table_area) != len(self.columns):
|
||||
raise ValueError("Length of table area and columns should be equal.")
|
||||
raise ValueError("{0}: Length of table area and columns"
|
||||
" should be equal.".format(os.path.basename(bname)))
|
||||
|
||||
table_bbox = {}
|
||||
for area in self.table_area:
|
||||
|
|
@ -370,9 +371,8 @@ class Stream:
|
|||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||
if ncols == 1:
|
||||
# no tables detected
|
||||
logger.warning("{}: Only one column was detected, the pdf"
|
||||
" may have no tables.".format(
|
||||
os.path.basename(bname)))
|
||||
warnings.warn("{0}: Page contains no tables.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
|
||||
|
|
|
|||
|
|
@ -528,7 +528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
|||
else:
|
||||
lt_col_overlap.append(-1)
|
||||
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
||||
logging.warning("Text doesn't fit any column.")
|
||||
logging.warning("Text did not fit any column.")
|
||||
r_idx = r
|
||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||
break
|
||||
|
|
@ -576,8 +576,7 @@ def get_score(error_weights):
|
|||
try:
|
||||
score = 0
|
||||
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
|
||||
raise ValueError("Please assign a valid weightage to each parameter"
|
||||
" such that their sum is equal to 100")
|
||||
raise ValueError("Sum of weights should be equal to 100.")
|
||||
for ew in error_weights:
|
||||
weight = ew[0] / len(ew[1])
|
||||
for error_percentage in ew[1]:
|
||||
|
|
|
|||
Loading…
Reference in New Issue