Don't let processes modify instance attributes

pull/2/head
Vinayak Mehta 2017-02-07 22:13:33 +05:30
parent 970256e19d
commit bc86346154
5 changed files with 77 additions and 43 deletions

View File

@ -1,5 +1,6 @@
from __future__ import division
import os
import sys
import types
import logging
import copy_reg
@ -15,6 +16,8 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
__all__ = ['Lattice']
logger = logging.getLogger("app_logger")
def _reduce_method(m):
if m.im_self is None:
@ -209,8 +212,9 @@ class Lattice:
ltchar = get_text_objects(layout, ltype="char")
width, height = dim
bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}.'.format(bname))
if not ltchar:
logging.warning("{0}: PDF has no text. It may be an image.".format(
logger.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return {os.path.basename(bname): None}
@ -265,7 +269,7 @@ class Lattice:
table_bbox = find_table_joints(contours, vmask, hmask)
if len(self.mtol) == 1 and self.mtol[0] == 2:
self.mtol = self.mtol * len(table_bbox)
mtolerance = self.mtol * len(table_bbox)
if self.debug:
self.debug_images = (img, table_bbox)
@ -279,9 +283,8 @@ class Lattice:
page = {}
tables = {}
table_no = 0
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox
table_data = {}
t_bbox = {}
@ -297,9 +300,9 @@ class Lattice:
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol[table_no])
sorted(rows, reverse=True), mtol=mtolerance[table_no])
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
@ -309,9 +312,9 @@ class Lattice:
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
@ -361,7 +364,6 @@ class Lattice:
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
page[os.path.basename(bname)] = tables
if self.debug:

View File

@ -1,5 +1,6 @@
import os
import shutil
import logging
import tempfile
import itertools
import multiprocessing as mp
@ -12,6 +13,8 @@ from .utils import get_page_layout, get_text_objects, get_rotation
__all__ = ['Pdf']
logger = logging.getLogger("app_logger")
def _parse_page_numbers(pagenos):
"""Converts list of dicts to list of ints.
@ -72,14 +75,16 @@ class Pdf:
raise TypeError("Only PDF format is supported right now.")
self.pagenos = _parse_page_numbers(pagenos)
self.parallel = parallel
self.cpu_count = mp.cpu_count()
self.pool = mp.Pool(processes=self.cpu_count)
if self.parallel:
self.cpu_count = mp.cpu_count()
self.pool = mp.Pool(processes=self.cpu_count)
self.clean = clean
self.temp = tempfile.mkdtemp()
def split(self):
"""Splits file into single page pdfs.
"""
logger.info('Splitting pages...')
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos:
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))

View File

@ -13,6 +13,8 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
__all__ = ['Stream']
logger = logging.getLogger("app_logger")
def _reduce_method(m):
if m.im_self is None:
@ -299,8 +301,9 @@ class Stream:
ltchar = get_text_objects(layout, ltype="char")
width, height = dim
bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}'.format(bname))
if not lttextlh:
logging.warning("{0}: PDF has no text. It may be an image.".format(
logger.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return {os.path.basename(bname): None}
@ -329,15 +332,14 @@ class Stream:
table_bbox = {(0, 0, width, height): None}
if len(self.ytol) == 1 and self.ytol[0] == 2:
self.ytol = self.ytol * len(table_bbox)
ytolerance = self.ytol * len(table_bbox)
if len(self.mtol) == 1 and self.mtol[0] == 0:
self.mtol = self.mtol * len(table_bbox)
mtolerance = self.mtol * len(table_bbox)
page = {}
tables = {}
table_no = 0
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox
table_data = {}
t_bbox = {}
@ -348,7 +350,7 @@ class Stream:
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no])
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
rows = _join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped]
@ -369,13 +371,13 @@ class Stream:
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the pdf"
" may have no tables. Specify ncols if"
" the pdf has tables.".format(
logger.warning("{}: Only one column was detected, the pdf"
" may have no tables. Specify ncols if"
" the pdf has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
@ -387,15 +389,15 @@ class Stream:
for t in t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _add_columns(cols, inner_text, ytolerance[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
@ -434,7 +436,6 @@ class Stream:
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
page[os.path.basename(bname)] = tables
return page

View File

@ -178,6 +178,33 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new
def setup_logging(log_filepath):
    """Configure and return the shared "app_logger" logger.

    Two handlers are attached to the logger:

    * a file handler appending UTF-8 text to ``log_filepath`` at DEBUG level
    * a console (stream) handler at INFO level

    Both use the same record format. The function is idempotent: calling it
    again does not attach a second console handler, and does not attach a
    second file handler for a path that is already being logged to, so
    records are never emitted twice.

    Args:
        log_filepath (string): Path to log file

    Returns:
        logging.Logger: Logger object
    """
    # Imported locally so the function does not depend on the module's
    # top-level imports.
    import os

    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)

    # One formatter is enough; both handlers use the identical layout.
    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    # FileHandler stores the absolute path in ``baseFilename``; compare
    # against that to detect a handler already writing to this file.
    target_path = os.path.abspath(log_filepath)
    has_file_handler = False
    has_console_handler = False
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler):
            if handler.baseFilename == target_path:
                has_file_handler = True
        elif isinstance(handler, logging.StreamHandler):
            # FileHandler subclasses StreamHandler, hence the elif ordering.
            has_console_handler = True

    # Log File Handler (Associating one log file per webservice run)
    if not has_file_handler:
        log_file_handler = logging.FileHandler(log_filepath,
                                               mode='a',
                                               encoding='utf-8')
        log_file_handler.setLevel(logging.DEBUG)
        log_file_handler.setFormatter(formatter)
        logger.addHandler(log_file_handler)

    # Stream Log Handler (For console)
    if not has_console_handler:
        stream_log_handler = logging.StreamHandler()
        stream_log_handler.setLevel(logging.INFO)
        stream_log_handler.setFormatter(formatter)
        logger.addHandler(stream_log_handler)

    return logger
def get_rotation(lttextlh, lttextlv, ltchar):
"""Detects if text in table is vertical or not using the current
transformation matrix (CTM) and returns its orientation.

View File

@ -4,7 +4,6 @@ import os
import sys
import glob
import time
import logging
import zipfile
import warnings
@ -18,6 +17,7 @@ from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
from camelot.ocr import OCR
from camelot import utils
doc = """
@ -34,7 +34,7 @@ options:
Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Log to file.
-l, --log <logfile> Log to file.
-o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 1.0]
@ -349,12 +349,11 @@ if __name__ == '__main__':
scorename = ''.join([scorename, '_info.csv'])
pngname, __ = os.path.splitext(filename)
if args['--log']:
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
if args['--output']:
logname = os.path.join(args['--output'], os.path.basename(logname))
logging.basicConfig(
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
if args['--log'] is not None:
logger = utils.setup_logging(args['--log'])
else:
logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))
p = []
if args['--pages'] == '1':
@ -383,7 +382,7 @@ if __name__ == '__main__':
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
mtol=mtol, scale=int(args['--scale']),
invert=args['--invert'], margins=margins,
split_text=args['--split_text'], flag_size=['--flag_size'],
split_text=args['--split_text'], flag_size=args['--flag_size'],
shift_text=shift_text, debug=args['--debug']),
filename,
pagenos=p,
@ -393,7 +392,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
@ -439,7 +438,7 @@ if __name__ == '__main__':
if args['--debug']:
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'stream':
try:
@ -455,7 +454,7 @@ if __name__ == '__main__':
manager = Pdf(Stream(table_area=tarea, columns=columns,
ncolumns=ncolumns, headers=header, ytol=ytol,
mtol=mtol, margins=margins, split_text=args['--split_text'],
flag_size=['--flag_size'], debug=args['--debug']),
flag_size=args['--flag_size'], debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
@ -464,7 +463,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
@ -509,7 +508,7 @@ if __name__ == '__main__':
if args['--debug']:
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'ocr':
try:
@ -526,7 +525,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
@ -572,7 +571,7 @@ if __name__ == '__main__':
if args['--debug']:
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
logger.exception(e.message, exc_info=True)
sys.exit()
if args['--debug']: