diff --git a/camelot/lattice.py b/camelot/lattice.py index f9a86e3..d1088d0 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -1,5 +1,6 @@ from __future__ import division import os +import sys import types import logging import copy_reg @@ -15,6 +16,8 @@ from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, __all__ = ['Lattice'] +logger = logging.getLogger("app_logger") + def _reduce_method(m): if m.im_self is None: @@ -209,8 +212,9 @@ class Lattice: ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) + logger.info('Parsing tables from {0}.'.format(bname)) if not ltchar: - logging.warning("{0}: PDF has no text. It may be an image.".format( + logger.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return {os.path.basename(bname): None} @@ -265,7 +269,7 @@ class Lattice: table_bbox = find_table_joints(contours, vmask, hmask) if len(self.mtol) == 1 and self.mtol[0] == 2: - self.mtol = self.mtol * len(table_bbox) + mtolerance = self.mtol * len(table_bbox) if self.debug: self.debug_images = (img, table_bbox) @@ -279,9 +283,8 @@ class Lattice: page = {} tables = {} - table_no = 0 # sort tables based on y-coord - for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): + for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): # select elements which lie within table_bbox table_data = {} t_bbox = {} @@ -297,9 +300,9 @@ class Lattice: cols.extend([k[0], k[2]]) rows.extend([k[1], k[3]]) # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no]) + cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) rows = merge_close_values( - sorted(rows, reverse=True), mtol=self.mtol[table_no]) + sorted(rows, reverse=True), mtol=mtolerance[table_no]) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in 
range(0, len(cols) - 1)] @@ -309,9 +312,9 @@ class Lattice: if self.headers is not None and self.headers[table_no] != [""]: self.headers[table_no] = self.headers[table_no].split(',') if len(self.headers[table_no]) != len(cols): - logging.warning("Length of header ({0}) specified for table is not" - " equal to the number of columns ({1}) detected.".format( - len(self.headers[table_no]), len(cols))) + logger.warning("Length of header ({0}) specified for table is not" + " equal to the number of columns ({1}) detected.".format( + len(self.headers[table_no]), len(cols))) while len(self.headers[table_no]) != len(cols): self.headers[table_no].append('') @@ -361,7 +364,6 @@ class Lattice: table_data['nrows'] = len(ar) table_data['ncols'] = len(ar[0]) tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 page[os.path.basename(bname)] = tables if self.debug: diff --git a/camelot/pdf.py b/camelot/pdf.py index 85c2dbc..b3e9b47 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -1,5 +1,6 @@ import os import shutil +import logging import tempfile import itertools import multiprocessing as mp @@ -12,6 +13,8 @@ from .utils import get_page_layout, get_text_objects, get_rotation __all__ = ['Pdf'] +logger = logging.getLogger("app_logger") + def _parse_page_numbers(pagenos): """Converts list of dicts to list of ints. @@ -72,14 +75,16 @@ class Pdf: raise TypeError("Only PDF format is supported right now.") self.pagenos = _parse_page_numbers(pagenos) self.parallel = parallel - self.cpu_count = mp.cpu_count() - self.pool = mp.Pool(processes=self.cpu_count) + if self.parallel: + self.cpu_count = mp.cpu_count() + self.pool = mp.Pool(processes=self.cpu_count) self.clean = clean self.temp = tempfile.mkdtemp() def split(self): """Splits file into single page pdfs. 
""" + logger.info('Splitting pages...') infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) for p in self.pagenos: sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p)) diff --git a/camelot/stream.py b/camelot/stream.py index c3089ca..7d5abe8 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -13,6 +13,8 @@ from .utils import (text_in_bbox, get_table_index, get_score, count_empty, __all__ = ['Stream'] +logger = logging.getLogger("app_logger") + def _reduce_method(m): if m.im_self is None: @@ -299,8 +301,9 @@ class Stream: ltchar = get_text_objects(layout, ltype="char") width, height = dim bname, __ = os.path.splitext(pdfname) + logger.info('Parsing tables from {0}'.format(bname)) if not lttextlh: - logging.warning("{0}: PDF has no text. It may be an image.".format( + logger.warning("{0}: PDF has no text. It may be an image.".format( os.path.basename(bname))) return {os.path.basename(bname): None} @@ -329,15 +332,14 @@ class Stream: table_bbox = {(0, 0, width, height): None} if len(self.ytol) == 1 and self.ytol[0] == 2: - self.ytol = self.ytol * len(table_bbox) + ytolerance = self.ytol * len(table_bbox) if len(self.mtol) == 1 and self.mtol[0] == 0: - self.mtol = self.mtol * len(table_bbox) + mtolerance = self.mtol * len(table_bbox) page = {} tables = {} - table_no = 0 # sort tables based on y-coord - for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): + for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): # select elements which lie within table_bbox table_data = {} t_bbox = {} @@ -348,7 +350,7 @@ class Stream: for direction in t_bbox: t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox) - rows_grouped = _group_rows(t_bbox['horizontal'], ytol=self.ytol[table_no]) + rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no]) rows = _join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in 
rows_grouped] @@ -369,13 +371,13 @@ class Stream: len_non_mode = len(filter(lambda x: x != ncols, elements)) if ncols == 1 and not self.debug: # no tables detected - logging.warning("{}: Only one column was detected, the pdf" - " may have no tables. Specify ncols if" - " the pdf has tables.".format( + logger.warning("{}: Only one column was detected, the pdf" + " may have no tables. Specify ncols if" + " the pdf has tables.".format( os.path.basename(bname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no]) + cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no]) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -387,15 +389,15 @@ class Stream: for t in t_bbox[direction] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) - cols = _add_columns(cols, inner_text, self.ytol[table_no]) + cols = _add_columns(cols, inner_text, ytolerance[table_no]) cols = _join_columns(cols, text_x_min, text_x_max) if self.headers is not None and self.headers[table_no] != [""]: self.headers[table_no] = self.headers[table_no].split(',') if len(self.headers[table_no]) != len(cols): - logging.warning("Length of header ({0}) specified for table is not" - " equal to the number of columns ({1}) detected.".format( - len(self.headers[table_no]), len(cols))) + logger.warning("Length of header ({0}) specified for table is not" + " equal to the number of columns ({1}) detected.".format( + len(self.headers[table_no]), len(cols))) while len(self.headers[table_no]) != len(cols): self.headers[table_no].append('') @@ -434,7 +436,6 @@ class Stream: table_data['nrows'] = len(ar) table_data['ncols'] = len(ar[0]) tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 page[os.path.basename(bname)] = tables return page \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index 5134579..a1610cb 100644 --- a/camelot/utils.py +++ 
def setup_logging(log_filepath):
    """Configure and return the shared ``"app_logger"`` logger.

    The logger level is set to DEBUG and two handlers are attached:
    a file handler (DEBUG and above, append mode, UTF-8) writing to
    ``log_filepath`` and a console stream handler (INFO and above).
    Both use the same timestamped format.

    The function is idempotent: calling it again with the same path does
    not attach a second pair of handlers, so records are never duplicated
    and no extra file descriptors are opened.

    Args:
        log_filepath (string): Path to log file
    Returns:
        logging.Logger: Logger object
    """
    # Imported locally so the function does not depend on the enclosing
    # module already importing these names.
    import logging
    import os

    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)

    format_string = '%(asctime)s - %(levelname)s - %(funcName)s - %(message)s'
    # One formatter instance is enough; handlers only read from it.
    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%dT%H:%M:%S')

    # FileHandler stores its target as an absolute path in ``baseFilename``,
    # so compare against the absolute form of the requested path.
    target_path = os.path.abspath(log_filepath)
    has_file_handler = False
    has_stream_handler = False
    for handler in logger.handlers:
        # FileHandler subclasses StreamHandler, so test it first.
        if isinstance(handler, logging.FileHandler):
            if handler.baseFilename == target_path:
                has_file_handler = True
        elif isinstance(handler, logging.StreamHandler):
            has_stream_handler = True

    if not has_file_handler:
        # Log File Handler (Associating one log file per webservice run)
        log_file_handler = logging.FileHandler(log_filepath,
                                               mode='a',
                                               encoding='utf-8')
        log_file_handler.setLevel(logging.DEBUG)
        log_file_handler.setFormatter(formatter)
        logger.addHandler(log_file_handler)

    if not has_stream_handler:
        # Stream Log Handler (For console)
        stream_log_handler = logging.StreamHandler()
        stream_log_handler.setLevel(logging.INFO)
        stream_log_handler.setFormatter(formatter)
        logger.addHandler(stream_log_handler)

    return logger
Chars closer than cmargin are grouped together to form a word. [default: 1.0] @@ -349,12 +349,11 @@ if __name__ == '__main__': scorename = ''.join([scorename, '_info.csv']) pngname, __ = os.path.splitext(filename) - if args['--log']: - FORMAT = '%(asctime)s - %(levelname)s - %(message)s' - if args['--output']: - logname = os.path.join(args['--output'], os.path.basename(logname)) - logging.basicConfig( - filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG) + FORMAT = '%(asctime)s - %(levelname)s - %(message)s' + if args['--log'] is not None: + logger = utils.setup_logging(args['--log']) + else: + logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log')) p = [] if args['--pages'] == '1': @@ -383,7 +382,7 @@ if __name__ == '__main__': manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header, mtol=mtol, scale=int(args['--scale']), invert=args['--invert'], margins=margins, - split_text=args['--split_text'], flag_size=['--flag_size'], + split_text=args['--split_text'], flag_size=args['--flag_size'], shift_text=shift_text, debug=args['--debug']), filename, pagenos=p, @@ -393,7 +392,7 @@ if __name__ == '__main__': processing_time = time.time() - start_time vprint("Finished processing in", processing_time, "seconds") - logging.info("Finished processing in " + str(processing_time) + " seconds") + logger.info("Finished processing in " + str(processing_time) + " seconds") if args['--plot']: if args['--output']: @@ -439,7 +438,7 @@ if __name__ == '__main__': if args['--debug']: manager.debug_plot() except Exception as e: - logging.exception(e.message, exc_info=True) + logger.exception(e.message, exc_info=True) sys.exit() elif args[''] == 'stream': try: @@ -455,7 +454,7 @@ if __name__ == '__main__': manager = Pdf(Stream(table_area=tarea, columns=columns, ncolumns=ncolumns, headers=header, ytol=ytol, mtol=mtol, margins=margins, split_text=args['--split_text'], - flag_size=['--flag_size'], debug=args['--debug']), + 
flag_size=args['--flag_size'], debug=args['--debug']), filename, pagenos=p, parallel=args['--parallel'], @@ -464,7 +463,7 @@ if __name__ == '__main__': processing_time = time.time() - start_time vprint("Finished processing in", processing_time, "seconds") - logging.info("Finished processing in " + str(processing_time) + " seconds") + logger.info("Finished processing in " + str(processing_time) + " seconds") if args['--plot']: if args['--output']: @@ -509,7 +508,7 @@ if __name__ == '__main__': if args['--debug']: manager.debug_plot() except Exception as e: - logging.exception(e.message, exc_info=True) + logger.exception(e.message, exc_info=True) sys.exit() elif args[''] == 'ocr': try: @@ -526,7 +525,7 @@ if __name__ == '__main__': processing_time = time.time() - start_time vprint("Finished processing in", processing_time, "seconds") - logging.info("Finished processing in " + str(processing_time) + " seconds") + logger.info("Finished processing in " + str(processing_time) + " seconds") if args['--plot']: if args['--output']: @@ -572,7 +571,7 @@ if __name__ == '__main__': if args['--debug']: manager.debug_plot() except Exception as e: - logging.exception(e.message, exc_info=True) + logger.exception(e.message, exc_info=True) sys.exit() if args['--debug']: