Don't let processes modify instance attributes

pull/2/head
Vinayak Mehta 2017-02-07 22:13:33 +05:30
parent 970256e19d
commit bc86346154
5 changed files with 77 additions and 43 deletions

View File

@ -1,5 +1,6 @@
from __future__ import division from __future__ import division
import os import os
import sys
import types import types
import logging import logging
import copy_reg import copy_reg
@ -15,6 +16,8 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
__all__ = ['Lattice'] __all__ = ['Lattice']
logger = logging.getLogger("app_logger")
def _reduce_method(m): def _reduce_method(m):
if m.im_self is None: if m.im_self is None:
@ -209,8 +212,9 @@ class Lattice:
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
width, height = dim width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}.'.format(bname))
if not ltchar: if not ltchar:
logging.warning("{0}: PDF has no text. It may be an image.".format( logger.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname))) os.path.basename(bname)))
return {os.path.basename(bname): None} return {os.path.basename(bname): None}
@ -265,7 +269,7 @@ class Lattice:
table_bbox = find_table_joints(contours, vmask, hmask) table_bbox = find_table_joints(contours, vmask, hmask)
if len(self.mtol) == 1 and self.mtol[0] == 2: if len(self.mtol) == 1 and self.mtol[0] == 2:
self.mtol = self.mtol * len(table_bbox) mtolerance = self.mtol * len(table_bbox)
if self.debug: if self.debug:
self.debug_images = (img, table_bbox) self.debug_images = (img, table_bbox)
@ -279,9 +283,8 @@ class Lattice:
page = {} page = {}
tables = {} tables = {}
table_no = 0
# sort tables based on y-coord # sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
t_bbox = {} t_bbox = {}
@ -297,9 +300,9 @@ class Lattice:
cols.extend([k[0], k[2]]) cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]]) rows.extend([k[1], k[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no]) cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
rows = merge_close_values( rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol[table_no]) sorted(rows, reverse=True), mtol=mtolerance[table_no])
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
@ -309,9 +312,9 @@ class Lattice:
if self.headers is not None and self.headers[table_no] != [""]: if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',') self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols): if len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not" logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format( " equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols))) len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols): while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('') self.headers[table_no].append('')
@ -361,7 +364,6 @@ class Lattice:
table_data['nrows'] = len(ar) table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0]) table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
page[os.path.basename(bname)] = tables page[os.path.basename(bname)] = tables
if self.debug: if self.debug:

View File

@ -1,5 +1,6 @@
import os import os
import shutil import shutil
import logging
import tempfile import tempfile
import itertools import itertools
import multiprocessing as mp import multiprocessing as mp
@ -12,6 +13,8 @@ from .utils import get_page_layout, get_text_objects, get_rotation
__all__ = ['Pdf'] __all__ = ['Pdf']
logger = logging.getLogger("app_logger")
def _parse_page_numbers(pagenos): def _parse_page_numbers(pagenos):
"""Converts list of dicts to list of ints. """Converts list of dicts to list of ints.
@ -72,14 +75,16 @@ class Pdf:
raise TypeError("Only PDF format is supported right now.") raise TypeError("Only PDF format is supported right now.")
self.pagenos = _parse_page_numbers(pagenos) self.pagenos = _parse_page_numbers(pagenos)
self.parallel = parallel self.parallel = parallel
self.cpu_count = mp.cpu_count() if self.parallel:
self.pool = mp.Pool(processes=self.cpu_count) self.cpu_count = mp.cpu_count()
self.pool = mp.Pool(processes=self.cpu_count)
self.clean = clean self.clean = clean
self.temp = tempfile.mkdtemp() self.temp = tempfile.mkdtemp()
def split(self): def split(self):
"""Splits file into single page pdfs. """Splits file into single page pdfs.
""" """
logger.info('Splitting pages...')
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos: for p in self.pagenos:
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p)) sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))

View File

@ -13,6 +13,8 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty,
__all__ = ['Stream'] __all__ = ['Stream']
logger = logging.getLogger("app_logger")
def _reduce_method(m): def _reduce_method(m):
if m.im_self is None: if m.im_self is None:
@ -299,8 +301,9 @@ class Stream:
ltchar = get_text_objects(layout, ltype="char") ltchar = get_text_objects(layout, ltype="char")
width, height = dim width, height = dim
bname, __ = os.path.splitext(pdfname) bname, __ = os.path.splitext(pdfname)
logger.info('Parsing tables from {0}'.format(bname))
if not lttextlh: if not lttextlh:
logging.warning("{0}: PDF has no text. It may be an image.".format( logger.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname))) os.path.basename(bname)))
return {os.path.basename(bname): None} return {os.path.basename(bname): None}
@ -329,15 +332,14 @@ class Stream:
table_bbox = {(0, 0, width, height): None} table_bbox = {(0, 0, width, height): None}
if len(self.ytol) == 1 and self.ytol[0] == 2: if len(self.ytol) == 1 and self.ytol[0] == 2:
self.ytol = self.ytol * len(table_bbox) ytolerance = self.ytol * len(table_bbox)
if len(self.mtol) == 1 and self.mtol[0] == 0: if len(self.mtol) == 1 and self.mtol[0] == 0:
self.mtol = self.mtol * len(table_bbox) mtolerance = self.mtol * len(table_bbox)
page = {} page = {}
tables = {} tables = {}
table_no = 0
# sort tables based on y-coord # sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox # select elements which lie within table_bbox
table_data = {} table_data = {}
t_bbox = {} t_bbox = {}
@ -348,7 +350,7 @@ class Stream:
for direction in t_bbox: for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no]) rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
rows = _join_rows(rows_grouped, text_y_max, text_y_min) rows = _join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
@ -369,13 +371,13 @@ class Stream:
len_non_mode = len(filter(lambda x: x != ncols, elements)) len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug: if ncols == 1 and not self.debug:
# no tables detected # no tables detected
logging.warning("{}: Only one column was detected, the pdf" logger.warning("{}: Only one column was detected, the pdf"
" may have no tables. Specify ncols if" " may have no tables. Specify ncols if"
" the pdf has tables.".format( " the pdf has tables.".format(
os.path.basename(bname))) os.path.basename(bname)))
cols = [(t.x0, t.x1) cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r] for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
@ -387,15 +389,15 @@ class Stream:
for t in t_bbox[direction] for t in t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _add_columns(cols, inner_text, ytolerance[table_no])
cols = _join_columns(cols, text_x_min, text_x_max) cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and self.headers[table_no] != [""]: if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',') self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols): if len(self.headers[table_no]) != len(cols):
logging.warning("Length of header ({0}) specified for table is not" logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format( " equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols))) len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols): while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('') self.headers[table_no].append('')
@ -434,7 +436,6 @@ class Stream:
table_data['nrows'] = len(ar) table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0]) table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
page[os.path.basename(bname)] = tables page[os.path.basename(bname)] = tables
return page return page

View File

@ -178,6 +178,33 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
def setup_logging(log_filepath):
    """Configure and return the shared ``app_logger`` logger.

    The logger level is set to DEBUG. Two handlers are attached: a file
    handler (DEBUG and above, append mode, UTF-8) writing to
    ``log_filepath``, and a console stream handler (INFO and above).
    Both use the same timestamped format.

    The function is idempotent: ``logging.getLogger`` returns the same
    logger object for a given name, so handlers are only attached when an
    equivalent one is not already present. Without that check every extra
    call would duplicate each log record and keep another file handle open.

    Args:
        log_filepath (string): Path to log file

    Returns:
        logging.Logger: Logger object
    """
    # Local import so this function does not depend on the module's
    # top-level import block for path normalisation.
    import os

    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    # Log File Handler (Associating one log file per webservice run).
    # FileHandler stores the absolute path in ``baseFilename``, so compare
    # against the absolute form of the requested path.
    target_path = os.path.abspath(log_filepath)
    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == target_path
        for handler in logger.handlers)
    if not has_file_handler:
        log_file_handler = logging.FileHandler(log_filepath,
                                               mode='a',
                                               encoding='utf-8')
        log_file_handler.setLevel(logging.DEBUG)
        log_file_handler.setFormatter(formatter)
        logger.addHandler(log_file_handler)

    # Stream Log Handler (For console).
    # Exact type check on purpose: FileHandler subclasses StreamHandler, so
    # isinstance() would mistake the file handler for a console handler.
    has_stream_handler = any(
        type(handler) is logging.StreamHandler
        for handler in logger.handlers)
    if not has_stream_handler:
        stream_log_handler = logging.StreamHandler()
        stream_log_handler.setLevel(logging.INFO)
        stream_log_handler.setFormatter(formatter)
        logger.addHandler(stream_log_handler)

    return logger
def get_rotation(lttextlh, lttextlv, ltchar): def get_rotation(lttextlh, lttextlv, ltchar):
"""Detects if text in table is vertical or not using the current """Detects if text in table is vertical or not using the current
transformation matrix (CTM) and returns its orientation. transformation matrix (CTM) and returns its orientation.

View File

@ -4,7 +4,6 @@ import os
import sys import sys
import glob import glob
import time import time
import logging
import zipfile import zipfile
import warnings import warnings
@ -18,6 +17,7 @@ from camelot.pdf import Pdf
from camelot.lattice import Lattice from camelot.lattice import Lattice
from camelot.stream import Stream from camelot.stream import Stream
from camelot.ocr import OCR from camelot.ocr import OCR
from camelot import utils
doc = """ doc = """
@ -34,7 +34,7 @@ options:
Example: -p 1,3-6,10 [default: 1] Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process. -P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv] -f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Log to file. -l, --log <logfile> Log to file.
-o, --output <directory> Output directory. -o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are -M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 1.0] grouped together to form a word. [default: 1.0]
@ -349,12 +349,11 @@ if __name__ == '__main__':
scorename = ''.join([scorename, '_info.csv']) scorename = ''.join([scorename, '_info.csv'])
pngname, __ = os.path.splitext(filename) pngname, __ = os.path.splitext(filename)
if args['--log']: FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
FORMAT = '%(asctime)s - %(levelname)s - %(message)s' if args['--log'] is not None:
if args['--output']: logger = utils.setup_logging(args['--log'])
logname = os.path.join(args['--output'], os.path.basename(logname)) else:
logging.basicConfig( logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
p = [] p = []
if args['--pages'] == '1': if args['--pages'] == '1':
@ -383,7 +382,7 @@ if __name__ == '__main__':
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header, manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
mtol=mtol, scale=int(args['--scale']), mtol=mtol, scale=int(args['--scale']),
invert=args['--invert'], margins=margins, invert=args['--invert'], margins=margins,
split_text=args['--split_text'], flag_size=['--flag_size'], split_text=args['--split_text'], flag_size=args['--flag_size'],
shift_text=shift_text, debug=args['--debug']), shift_text=shift_text, debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
@ -393,7 +392,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds") vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds") logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']: if args['--plot']:
if args['--output']: if args['--output']:
@ -439,7 +438,7 @@ if __name__ == '__main__':
if args['--debug']: if args['--debug']:
manager.debug_plot() manager.debug_plot()
except Exception as e: except Exception as e:
logging.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
try: try:
@ -455,7 +454,7 @@ if __name__ == '__main__':
manager = Pdf(Stream(table_area=tarea, columns=columns, manager = Pdf(Stream(table_area=tarea, columns=columns,
ncolumns=ncolumns, headers=header, ytol=ytol, ncolumns=ncolumns, headers=header, ytol=ytol,
mtol=mtol, margins=margins, split_text=args['--split_text'], mtol=mtol, margins=margins, split_text=args['--split_text'],
flag_size=['--flag_size'], debug=args['--debug']), flag_size=args['--flag_size'], debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
parallel=args['--parallel'], parallel=args['--parallel'],
@ -464,7 +463,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds") vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds") logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']: if args['--plot']:
if args['--output']: if args['--output']:
@ -509,7 +508,7 @@ if __name__ == '__main__':
if args['--debug']: if args['--debug']:
manager.debug_plot() manager.debug_plot()
except Exception as e: except Exception as e:
logging.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
elif args['<method>'] == 'ocr': elif args['<method>'] == 'ocr':
try: try:
@ -526,7 +525,7 @@ if __name__ == '__main__':
processing_time = time.time() - start_time processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds") vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds") logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']: if args['--plot']:
if args['--output']: if args['--output']:
@ -572,7 +571,7 @@ if __name__ == '__main__':
if args['--debug']: if args['--debug']:
manager.debug_plot() manager.debug_plot()
except Exception as e: except Exception as e:
logging.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
if args['--debug']: if args['--debug']: