Fix warnings and exceptions
parent
18e1a799a1
commit
5c5bd6199c
|
|
@ -5,6 +5,7 @@ import copy
|
||||||
import types
|
import types
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
import copy_reg
|
||||||
|
import warnings
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
|
|
@ -16,8 +17,7 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Lattice']
|
__all__ = ['Lattice']
|
||||||
|
logger = logging.getLogger('app_logger')
|
||||||
logger = logging.getLogger("app_logger")
|
|
||||||
|
|
||||||
|
|
||||||
def _reduce_method(m):
|
def _reduce_method(m):
|
||||||
|
|
@ -231,9 +231,9 @@ class Lattice:
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
width, height = dim
|
width, height = dim
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
|
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||||
if not ltchar:
|
if not ltchar:
|
||||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
warnings.warn("{0}: Page contains no text.".format(
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
return {os.path.basename(bname): None}
|
return {os.path.basename(bname): None}
|
||||||
|
|
||||||
|
|
@ -269,7 +269,8 @@ class Lattice:
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
if len(self.table_area) != len(self.fill):
|
if len(self.table_area) != len(self.fill):
|
||||||
raise ValueError("Length of table area and fill should be equal.")
|
raise ValueError("{0}: Length of table area and fill should"
|
||||||
|
" be equal.".format(os.path.basename(bname)))
|
||||||
|
|
||||||
areas = []
|
areas = []
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import copy
|
import copy
|
||||||
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import pyocr
|
import pyocr
|
||||||
|
|
@ -11,6 +12,10 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
from .utils import merge_close_values, encode_list
|
from .utils import merge_close_values, encode_list
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['OCRLattice', 'OCRStream']
|
||||||
|
logger = logging.getLogger('app_logger')
|
||||||
|
|
||||||
|
|
||||||
class OCRLattice:
|
class OCRLattice:
|
||||||
"""Lattice, but for images.
|
"""Lattice, but for images.
|
||||||
|
|
||||||
|
|
@ -81,6 +86,7 @@ class OCRLattice:
|
||||||
|
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
imagename = ''.join([bname, '.png'])
|
imagename = ''.join([bname, '.png'])
|
||||||
|
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||||
|
|
||||||
gs_call = [
|
gs_call = [
|
||||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
||||||
|
|
@ -230,6 +236,7 @@ class OCRStream:
|
||||||
|
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
imagename = ''.join([bname, '.png'])
|
imagename = ''.join([bname, '.png'])
|
||||||
|
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||||
|
|
||||||
gs_call = [
|
gs_call = [
|
||||||
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
"-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
|
||||||
|
|
@ -252,7 +259,8 @@ class OCRStream:
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.columns is not None:
|
if self.columns is not None:
|
||||||
if len(self.table_area) != len(self.columns):
|
if len(self.table_area) != len(self.columns):
|
||||||
raise ValueError("Length of table area and columns should be equal.")
|
raise ValueError("{0}: Length of table area and columns"
|
||||||
|
"should be equal.".format(os.path.basename(bname)))
|
||||||
|
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import logging
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import itertools
|
import itertools
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
|
|
@ -14,8 +13,6 @@ from .utils import get_page_layout, get_text_objects, get_rotation
|
||||||
|
|
||||||
__all__ = ['Pdf']
|
__all__ = ['Pdf']
|
||||||
|
|
||||||
logger = logging.getLogger("app_logger")
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_page_numbers(pagenos):
|
def _parse_page_numbers(pagenos):
|
||||||
"""Converts list of dicts to list of ints.
|
"""Converts list of dicts to list of ints.
|
||||||
|
|
@ -104,7 +101,7 @@ class Pdf:
|
||||||
self.extractor = extractor
|
self.extractor = extractor
|
||||||
self.pdfname = pdfname
|
self.pdfname = pdfname
|
||||||
if not self.pdfname.endswith('.pdf'):
|
if not self.pdfname.endswith('.pdf'):
|
||||||
raise TypeError("Only PDF format is supported right now.")
|
raise TypeError("File format not supported.")
|
||||||
self.pagenos = _parse_page_numbers(pagenos)
|
self.pagenos = _parse_page_numbers(pagenos)
|
||||||
self.parallel = parallel
|
self.parallel = parallel
|
||||||
if self.parallel:
|
if self.parallel:
|
||||||
|
|
@ -116,7 +113,6 @@ class Pdf:
|
||||||
def split(self):
|
def split(self):
|
||||||
"""Splits file into single page pdfs.
|
"""Splits file into single page pdfs.
|
||||||
"""
|
"""
|
||||||
logger.info('Splitting pages...')
|
|
||||||
if self.parallel:
|
if self.parallel:
|
||||||
pfunc = partial(_save_page, self.temp, self.pdfname)
|
pfunc = partial(_save_page, self.temp, self.pdfname)
|
||||||
self.pool.map(pfunc, self.pagenos)
|
self.pool.map(pfunc, self.pagenos)
|
||||||
|
|
@ -211,7 +207,7 @@ class Pdf:
|
||||||
plt.imshow(img)
|
plt.imshow(img)
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError("This option only be used with Lattice.")
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
elif self.debug == 'joint':
|
elif self.debug == 'joint':
|
||||||
try:
|
try:
|
||||||
for img, table_bbox in self.debug_images:
|
for img, table_bbox in self.debug_images:
|
||||||
|
|
@ -227,7 +223,7 @@ class Pdf:
|
||||||
plt.imshow(img)
|
plt.imshow(img)
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError("This option only be used with Lattice.")
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
elif self.debug == 'line':
|
elif self.debug == 'line':
|
||||||
try:
|
try:
|
||||||
for v_s, h_s in self.debug_segments:
|
for v_s, h_s in self.debug_segments:
|
||||||
|
|
@ -237,7 +233,7 @@ class Pdf:
|
||||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError("This option only be used with Lattice.")
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
elif self.debug == 'table':
|
elif self.debug == 'table':
|
||||||
try:
|
try:
|
||||||
for tables in self.debug_tables:
|
for tables in self.debug_tables:
|
||||||
|
|
@ -266,7 +262,7 @@ class Pdf:
|
||||||
table.cells[r][c].rb[1]])
|
table.cells[r][c].rb[1]])
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise ValueError("This option only be used with Lattice.")
|
raise ValueError("This option can only be used with Lattice.")
|
||||||
else:
|
else:
|
||||||
raise UserWarning("This method can only be called after"
|
raise UserWarning("This method can only be called after"
|
||||||
" debug has been specified.")
|
" debug has been specified.")
|
||||||
|
|
@ -4,6 +4,7 @@ import copy
|
||||||
import types
|
import types
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
import copy_reg
|
||||||
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
@ -13,8 +14,7 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Stream']
|
__all__ = ['Stream']
|
||||||
|
logger = logging.getLogger('app_logger')
|
||||||
logger = logging.getLogger("app_logger")
|
|
||||||
|
|
||||||
|
|
||||||
def _reduce_method(m):
|
def _reduce_method(m):
|
||||||
|
|
@ -297,9 +297,9 @@ class Stream:
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
ltchar = get_text_objects(layout, ltype="char")
|
||||||
width, height = dim
|
width, height = dim
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
logger.info('Parsing tables from {0}.'.format(os.path.basename(bname)))
|
logger.info('Processing {0}.'.format(os.path.basename(bname)))
|
||||||
if not lttextlh:
|
if not lttextlh:
|
||||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
warnings.warn("{0}: Page contains no text.".format(
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
return {os.path.basename(bname): None}
|
return {os.path.basename(bname): None}
|
||||||
|
|
||||||
|
|
@ -312,7 +312,8 @@ class Stream:
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.columns is not None:
|
if self.columns is not None:
|
||||||
if len(self.table_area) != len(self.columns):
|
if len(self.table_area) != len(self.columns):
|
||||||
raise ValueError("Length of table area and columns should be equal.")
|
raise ValueError("{0}: Length of table area and columns"
|
||||||
|
"should be equal.".format(os.path.basename(bname)))
|
||||||
|
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
@ -370,8 +371,7 @@ class Stream:
|
||||||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||||
if ncols == 1:
|
if ncols == 1:
|
||||||
# no tables detected
|
# no tables detected
|
||||||
logger.warning("{}: Only one column was detected, the pdf"
|
warnings.warn("{0}: Page contains no tables.".format(
|
||||||
" may have no tables.".format(
|
|
||||||
os.path.basename(bname)))
|
os.path.basename(bname)))
|
||||||
cols = [(t.x0, t.x1)
|
cols = [(t.x0, t.x1)
|
||||||
for r in rows_grouped if len(r) == ncols for t in r]
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
|
|
|
||||||
|
|
@ -528,7 +528,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||||
else:
|
else:
|
||||||
lt_col_overlap.append(-1)
|
lt_col_overlap.append(-1)
|
||||||
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
|
||||||
logging.warning("Text doesn't fit any column.")
|
logging.warning("Text did not fit any column.")
|
||||||
r_idx = r
|
r_idx = r
|
||||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||||
break
|
break
|
||||||
|
|
@ -576,8 +576,7 @@ def get_score(error_weights):
|
||||||
try:
|
try:
|
||||||
score = 0
|
score = 0
|
||||||
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
|
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
|
||||||
raise ValueError("Please assign a valid weightage to each parameter"
|
raise ValueError("Sum of weights should be equal to 100.")
|
||||||
" such that their sum is equal to 100")
|
|
||||||
for ew in error_weights:
|
for ew in error_weights:
|
||||||
weight = ew[0] / len(ew[1])
|
weight = ew[0] / len(ew[1])
|
||||||
for error_percentage in ew[1]:
|
for error_percentage in ew[1]:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue