From 3045a926301d11bd246c43532cbb984c1ab33f24 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 12 Jul 2016 17:44:08 +0530 Subject: [PATCH] Add support for pdfminer LAParams --- basic.py | 5 +++-- camelot.py | 21 ++++++++++++++------- pdf.py | 6 ++++-- spreadsheet.py | 6 ++++-- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/basic.py b/basic.py index e7ce1f6..e2ff777 100644 --- a/basic.py +++ b/basic.py @@ -33,9 +33,10 @@ def get_column_idx(t, columns): return c -def basic(pdf_dir, filename): +def basic(pdf_dir, filename, char_margin, line_margin, word_margin): print "working on", filename - text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') + text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic', + char_margin, line_margin, word_margin) text.sort(key=lambda x: (-x.y0, x.x0)) y_last = 0 data = [] diff --git a/camelot.py b/camelot.py index ec8fa96..7efd914 100644 --- a/camelot.py +++ b/camelot.py @@ -36,20 +36,26 @@ parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)') parser.add_argument('-f', '--format', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"]) -parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet', +parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet', help='Extract tables with ruling lines. (default: False)') -parser.add_argument('-F', '--fill', action='store', dest='fill', +parser.add_argument('-i', '--fill', action='store', dest='fill', help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None) -parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale', +parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) parser.add_argument('-j', '--jtol', nargs='?', action='store', dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int) -parser.add_argument('-M', '--mtol', nargs='?', action='store', +parser.add_argument('-t', '--mtol', nargs='?', action='store', dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int) -parser.add_argument('-i', '--invert', action='store_true', dest='invert', +parser.add_argument('-n', '--invert', action='store_true', dest='invert', help='Make sure lines are in foreground. (default: False)') parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug', help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"') +parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin', + help='(default: 2.0)', default=2.0, type=float) +parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin', + help='(default: 0.5)', default=0.5, type=float) +parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin', + help='(default: 0.1)', default=0.1, type=float) parser.add_argument('-o', '--output', nargs=1, action='store', dest='output', help='Specify output directory.') parser.add_argument('file', nargs=1) @@ -98,14 +104,15 @@ if result.spreadsheet: g, '-depth', '8', g[:-4] + '.png'])) try: spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale, - result.jtol, result.mtol, result.invert, result.debug) + result.jtol, result.mtol, result.invert, result.debug, + result.char_margin, result.line_margin, result.word_margin) except: logging.error("Couldn't parse " + g.split('/')[-1]) print "Couldn't parse", g.split('/')[-1] else: print "using the basic method" for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): - basic(pdf_dir, g.split('/')[-1]) + basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin) if result.format == ['xlsx']: import csv diff --git a/pdf.py b/pdf.py index 3c603e0..136904c 100644 --- a/pdf.py +++ b/pdf.py @@ -37,13 +37,15 @@ def parse_text_spreadsheet(layout, t=None): return t -def get_pdf_info(pdfname, method): +def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin): with open(pdfname, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) if not document.is_extractable: raise PDFTextExtractionNotAllowed - laparams = LAParams() + laparams = LAParams(char_margin=char_margin, + line_margin=line_margin, + word_margin=word_margin) rsrcmgr = PDFResourceManager() device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) diff --git a/spreadsheet.py b/spreadsheet.py index 8d04a4c..46ea466 100644 --- a/spreadsheet.py +++ b/spreadsheet.py @@ -11,7 +11,8 @@ from utils import (translate, scale, merge_close_values, get_row_idx, get_column_idx, reduce_index, outline, fill, remove_empty) -def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug): +def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, + char_margin, line_margin, word_margin): if debug: import matplotlib.pyplot as plt import matplotlib.patches as patches @@ -20,7 +21,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug): img = cv2.imread(imagename) img_x, img_y = img.shape[1], img.shape[0] text, pdf_x, pdf_y = get_pdf_info( - os.path.join(pdf_dir, filename), 'spreadsheet') + os.path.join(pdf_dir, filename), 'spreadsheet', + char_margin, line_margin, word_margin) scaling_factor_x = pdf_x / float(img_x) scaling_factor_y = pdf_y / float(img_y) tables, v_segments, h_segments = morph_transform(imagename, s, invert)