Don't let processes modify instance attributes
parent
970256e19d
commit
bc86346154
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
import logging
|
||||
import copy_reg
|
||||
|
|
@ -15,6 +16,8 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
|||
|
||||
__all__ = ['Lattice']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
if m.im_self is None:
|
||||
|
|
@ -209,8 +212,9 @@ class Lattice:
|
|||
ltchar = get_text_objects(layout, ltype="char")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
logger.info('Parsing tables from {0}.'.format(bname))
|
||||
if not ltchar:
|
||||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return {os.path.basename(bname): None}
|
||||
|
||||
|
|
@ -265,7 +269,7 @@ class Lattice:
|
|||
table_bbox = find_table_joints(contours, vmask, hmask)
|
||||
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||
self.mtol = self.mtol * len(table_bbox)
|
||||
mtolerance = self.mtol * len(table_bbox)
|
||||
|
||||
if self.debug:
|
||||
self.debug_images = (img, table_bbox)
|
||||
|
|
@ -279,9 +283,8 @@ class Lattice:
|
|||
|
||||
page = {}
|
||||
tables = {}
|
||||
table_no = 0
|
||||
# sort tables based on y-coord
|
||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = {}
|
||||
|
|
@ -297,9 +300,9 @@ class Lattice:
|
|||
cols.extend([k[0], k[2]])
|
||||
rows.extend([k[1], k[3]])
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
|
||||
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
|
||||
rows = merge_close_values(
|
||||
sorted(rows, reverse=True), mtol=self.mtol[table_no])
|
||||
sorted(rows, reverse=True), mtol=mtolerance[table_no])
|
||||
# make grid using x and y coord of shortlisted rows and cols
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
|
|
@ -309,7 +312,7 @@ class Lattice:
|
|||
if self.headers is not None and self.headers[table_no] != [""]:
|
||||
self.headers[table_no] = self.headers[table_no].split(',')
|
||||
if len(self.headers[table_no]) != len(cols):
|
||||
logging.warning("Length of header ({0}) specified for table is not"
|
||||
logger.warning("Length of header ({0}) specified for table is not"
|
||||
" equal to the number of columns ({1}) detected.".format(
|
||||
len(self.headers[table_no]), len(cols)))
|
||||
while len(self.headers[table_no]) != len(cols):
|
||||
|
|
@ -361,7 +364,6 @@ class Lattice:
|
|||
table_data['nrows'] = len(ar)
|
||||
table_data['ncols'] = len(ar[0])
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
table_no += 1
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
if self.debug:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import tempfile
|
||||
import itertools
|
||||
import multiprocessing as mp
|
||||
|
|
@ -12,6 +13,8 @@ from .utils import get_page_layout, get_text_objects, get_rotation
|
|||
|
||||
__all__ = ['Pdf']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
|
||||
|
||||
def _parse_page_numbers(pagenos):
|
||||
"""Converts list of dicts to list of ints.
|
||||
|
|
@ -72,6 +75,7 @@ class Pdf:
|
|||
raise TypeError("Only PDF format is supported right now.")
|
||||
self.pagenos = _parse_page_numbers(pagenos)
|
||||
self.parallel = parallel
|
||||
if self.parallel:
|
||||
self.cpu_count = mp.cpu_count()
|
||||
self.pool = mp.Pool(processes=self.cpu_count)
|
||||
self.clean = clean
|
||||
|
|
@ -80,6 +84,7 @@ class Pdf:
|
|||
def split(self):
|
||||
"""Splits file into single page pdfs.
|
||||
"""
|
||||
logger.info('Splitting pages...')
|
||||
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
||||
for p in self.pagenos:
|
||||
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
|
|||
|
||||
__all__ = ['Stream']
|
||||
|
||||
logger = logging.getLogger("app_logger")
|
||||
|
||||
|
||||
def _reduce_method(m):
|
||||
if m.im_self is None:
|
||||
|
|
@ -299,8 +301,9 @@ class Stream:
|
|||
ltchar = get_text_objects(layout, ltype="char")
|
||||
width, height = dim
|
||||
bname, __ = os.path.splitext(pdfname)
|
||||
logger.info('Parsing tables from {0}'.format(bname))
|
||||
if not lttextlh:
|
||||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
logger.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return {os.path.basename(bname): None}
|
||||
|
||||
|
|
@ -329,15 +332,14 @@ class Stream:
|
|||
table_bbox = {(0, 0, width, height): None}
|
||||
|
||||
if len(self.ytol) == 1 and self.ytol[0] == 2:
|
||||
self.ytol = self.ytol * len(table_bbox)
|
||||
ytolerance = self.ytol * len(table_bbox)
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 0:
|
||||
self.mtol = self.mtol * len(table_bbox)
|
||||
mtolerance = self.mtol * len(table_bbox)
|
||||
|
||||
page = {}
|
||||
tables = {}
|
||||
table_no = 0
|
||||
# sort tables based on y-coord
|
||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = {}
|
||||
|
|
@ -348,7 +350,7 @@ class Stream:
|
|||
for direction in t_bbox:
|
||||
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
|
||||
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no])
|
||||
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
|
||||
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
|
|
@ -369,13 +371,13 @@ class Stream:
|
|||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||
if ncols == 1 and not self.debug:
|
||||
# no tables detected
|
||||
logging.warning("{}: Only one column was detected, the pdf"
|
||||
logger.warning("{}: Only one column was detected, the pdf"
|
||||
" may have no tables. Specify ncols if"
|
||||
" the pdf has tables.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
|
||||
cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
|
||||
inner_text = []
|
||||
for i in range(1, len(cols)):
|
||||
left = cols[i - 1][1]
|
||||
|
|
@ -387,13 +389,13 @@ class Stream:
|
|||
for t in t_bbox[direction]
|
||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(outer_text)
|
||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||
cols = _add_columns(cols, inner_text, ytolerance[table_no])
|
||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||
|
||||
if self.headers is not None and self.headers[table_no] != [""]:
|
||||
self.headers[table_no] = self.headers[table_no].split(',')
|
||||
if len(self.headers[table_no]) != len(cols):
|
||||
logging.warning("Length of header ({0}) specified for table is not"
|
||||
logger.warning("Length of header ({0}) specified for table is not"
|
||||
" equal to the number of columns ({1}) detected.".format(
|
||||
len(self.headers[table_no]), len(cols)))
|
||||
while len(self.headers[table_no]) != len(cols):
|
||||
|
|
@ -434,7 +436,6 @@ class Stream:
|
|||
table_data['nrows'] = len(ar)
|
||||
table_data['ncols'] = len(ar[0])
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
table_no += 1
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
return page
|
||||
|
|
@ -178,6 +178,33 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
|
|||
return tables_new, v_segments_new, h_segments_new
|
||||
|
||||
|
||||
def setup_logging(log_filepath):
    """Configure and return the shared "app_logger" logger.

    Attaches a DEBUG-level file handler (append mode, UTF-8) that writes to
    ``log_filepath`` and an INFO-level console handler, both using the same
    timestamped format. The function is idempotent: calling it again with
    the same path does not attach duplicate handlers, so every record is
    emitted once per destination.

    Args:
        log_filepath (string): Path to log file

    Returns:
        logging.Logger: Logger object

    """
    # Local import: only needed to normalise the path for the duplicate
    # check below.
    import os

    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    # Log File Handler (Associating one log file per webservice run).
    # FileHandler stores the absolute path of its target in ``baseFilename``,
    # which lets us detect a handler already writing to the same file.
    target_path = os.path.abspath(log_filepath)
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) and
        handler.baseFilename == target_path
        for handler in logger.handlers)
    if not has_file_handler:
        log_file_handler = logging.FileHandler(log_filepath,
                                               mode='a',
                                               encoding='utf-8')
        log_file_handler.setLevel(logging.DEBUG)
        log_file_handler.setFormatter(formatter)
        logger.addHandler(log_file_handler)

    # Stream Log Handler (For console).
    # FileHandler subclasses StreamHandler, so it is excluded explicitly
    # when looking for an existing console handler.
    has_stream_handler = any(
        isinstance(handler, logging.StreamHandler) and
        not isinstance(handler, logging.FileHandler)
        for handler in logger.handlers)
    if not has_stream_handler:
        stream_log_handler = logging.StreamHandler()
        stream_log_handler.setLevel(logging.INFO)
        stream_log_handler.setFormatter(formatter)
        logger.addHandler(stream_log_handler)

    return logger
|
||||
|
||||
|
||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||
"""Detects if text in table is vertical or not using the current
|
||||
transformation matrix (CTM) and returns its orientation.
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import os
|
|||
import sys
|
||||
import glob
|
||||
import time
|
||||
import logging
|
||||
import zipfile
|
||||
import warnings
|
||||
|
||||
|
|
@ -18,6 +17,7 @@ from camelot.pdf import Pdf
|
|||
from camelot.lattice import Lattice
|
||||
from camelot.stream import Stream
|
||||
from camelot.ocr import OCR
|
||||
from camelot import utils
|
||||
|
||||
|
||||
doc = """
|
||||
|
|
@ -34,7 +34,7 @@ options:
|
|||
Example: -p 1,3-6,10 [default: 1]
|
||||
-P, --parallel Parallelize the parsing process.
|
||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||
-l, --log Log to file.
|
||||
-l, --log <logfile> Log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 1.0]
|
||||
|
|
@ -349,12 +349,11 @@ if __name__ == '__main__':
|
|||
scorename = ''.join([scorename, '_info.csv'])
|
||||
pngname, __ = os.path.splitext(filename)
|
||||
|
||||
if args['--log']:
|
||||
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
|
||||
if args['--output']:
|
||||
logname = os.path.join(args['--output'], os.path.basename(logname))
|
||||
logging.basicConfig(
|
||||
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
|
||||
if args['--log'] is not None:
|
||||
logger = utils.setup_logging(args['--log'])
|
||||
else:
|
||||
logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))
|
||||
|
||||
p = []
|
||||
if args['--pages'] == '1':
|
||||
|
|
@ -383,7 +382,7 @@ if __name__ == '__main__':
|
|||
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
|
||||
mtol=mtol, scale=int(args['--scale']),
|
||||
invert=args['--invert'], margins=margins,
|
||||
split_text=args['--split_text'], flag_size=['--flag_size'],
|
||||
split_text=args['--split_text'], flag_size=args['--flag_size'],
|
||||
shift_text=shift_text, debug=args['--debug']),
|
||||
filename,
|
||||
pagenos=p,
|
||||
|
|
@ -393,7 +392,7 @@ if __name__ == '__main__':
|
|||
|
||||
processing_time = time.time() - start_time
|
||||
vprint("Finished processing in", processing_time, "seconds")
|
||||
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
|
||||
if args['--plot']:
|
||||
if args['--output']:
|
||||
|
|
@ -439,7 +438,7 @@ if __name__ == '__main__':
|
|||
if args['--debug']:
|
||||
manager.debug_plot()
|
||||
except Exception as e:
|
||||
logging.exception(e.message, exc_info=True)
|
||||
logger.exception(e.message, exc_info=True)
|
||||
sys.exit()
|
||||
elif args['<method>'] == 'stream':
|
||||
try:
|
||||
|
|
@ -455,7 +454,7 @@ if __name__ == '__main__':
|
|||
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
||||
ncolumns=ncolumns, headers=header, ytol=ytol,
|
||||
mtol=mtol, margins=margins, split_text=args['--split_text'],
|
||||
flag_size=['--flag_size'], debug=args['--debug']),
|
||||
flag_size=args['--flag_size'], debug=args['--debug']),
|
||||
filename,
|
||||
pagenos=p,
|
||||
parallel=args['--parallel'],
|
||||
|
|
@ -464,7 +463,7 @@ if __name__ == '__main__':
|
|||
|
||||
processing_time = time.time() - start_time
|
||||
vprint("Finished processing in", processing_time, "seconds")
|
||||
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
|
||||
if args['--plot']:
|
||||
if args['--output']:
|
||||
|
|
@ -509,7 +508,7 @@ if __name__ == '__main__':
|
|||
if args['--debug']:
|
||||
manager.debug_plot()
|
||||
except Exception as e:
|
||||
logging.exception(e.message, exc_info=True)
|
||||
logger.exception(e.message, exc_info=True)
|
||||
sys.exit()
|
||||
elif args['<method>'] == 'ocr':
|
||||
try:
|
||||
|
|
@ -526,7 +525,7 @@ if __name__ == '__main__':
|
|||
|
||||
processing_time = time.time() - start_time
|
||||
vprint("Finished processing in", processing_time, "seconds")
|
||||
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
||||
|
||||
if args['--plot']:
|
||||
if args['--output']:
|
||||
|
|
@ -572,7 +571,7 @@ if __name__ == '__main__':
|
|||
if args['--debug']:
|
||||
manager.debug_plot()
|
||||
except Exception as e:
|
||||
logging.exception(e.message, exc_info=True)
|
||||
logger.exception(e.message, exc_info=True)
|
||||
sys.exit()
|
||||
|
||||
if args['--debug']:
|
||||
|
|
|
|||
Loading…
Reference in New Issue