Add support for pdfminer LAParams

2016-07-12 17:44:08 +05:30 · 2016-07-12 17:44:08 +05:30 · 3045a92630
parent 2ef3cc7651
commit 3045a92630
4 changed files with 25 additions and 13 deletions
--- a/basic.py
+++ b/basic.py
@ -33,9 +33,10 @@ def get_column_idx(t, columns):
            return c


-def basic(pdf_dir, filename):
+def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
    print "working on", filename
-    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
+    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
+                              char_margin, line_margin, word_margin)
    text.sort(key=lambda x: (-x.y0, x.x0))
    y_last = 0
    data = []
--- a/camelot.py
+++ b/camelot.py
@ -36,20 +36,26 @@ parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
                    help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
 parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
                    help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
+parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
                    help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-F', '--fill', action='store', dest='fill',
-                    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
+parser.add_argument('-i', '--fill', action='store', dest='fill',
+                    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -i="h", -i="v", -i="hv" (default: None)', default=None)
-parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
+parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
                    help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
 parser.add_argument('-j', '--jtol', nargs='?', action='store',
                    dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-M', '--mtol', nargs='?', action='store',
+parser.add_argument('-t', '--mtol', nargs='?', action='store',
                    dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-i', '--invert', action='store_true', dest='invert',
+parser.add_argument('-n', '--invert', action='store_true', dest='invert',
                    help='Make sure lines are in foreground. (default: False)')
 parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
                    help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
+parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin', const=2.0,
+                    help='pdfminer LAParams char_margin: characters closer together than this margin (relative to character width) are grouped into the same line. (default: 2.0)', default=2.0, type=float)
+parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin', const=0.5,
+                    help='pdfminer LAParams line_margin: lines closer together than this margin (relative to line height) are grouped into the same text box. (default: 0.5)', default=0.5, type=float)
+parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin', const=0.1,
+                    help='pdfminer LAParams word_margin: characters on the same line further apart than this margin (relative to character width) are split into separate words. (default: 0.1)', default=0.1, type=float)
 parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
                    help='Specify output directory.')
 parser.add_argument('file', nargs=1)
@ -98,14 +104,15 @@ if result.spreadsheet:
                            g, '-depth', '8', g[:-4] + '.png']))
        try:
            spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
-                        result.jtol, result.mtol, result.invert, result.debug)
+                        result.jtol, result.mtol, result.invert, result.debug,
+                        result.char_margin, result.line_margin, result.word_margin)
        except:
          logging.error("Couldn't parse " + g.split('/')[-1])
          print "Couldn't parse", g.split('/')[-1]
 else:
    print "using the basic method"
    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        basic(pdf_dir, g.split('/')[-1])
+        basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)

 if result.format == ['xlsx']:
    import csv
--- a/pdf.py
+++ b/pdf.py
@ -37,13 +37,15 @@ def parse_text_spreadsheet(layout, t=None):
    return t


-def get_pdf_info(pdfname, method):
+def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
    with open(pdfname, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
-        laparams = LAParams()
+        laparams = LAParams(char_margin=char_margin,
+                            line_margin=line_margin,
+                            word_margin=word_margin)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
--- a/spreadsheet.py
+++ b/spreadsheet.py
@ -11,7 +11,8 @@ from utils import (translate, scale, merge_close_values, get_row_idx,
                   get_column_idx, reduce_index, outline, fill, remove_empty)


-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
+def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
+                char_margin, line_margin, word_margin):
    if debug:
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches
@ -20,7 +21,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
    img = cv2.imread(imagename)
    img_x, img_y = img.shape[1], img.shape[0]
    text, pdf_x, pdf_y = get_pdf_info(
-        os.path.join(pdf_dir, filename), 'spreadsheet')
+        os.path.join(pdf_dir, filename), 'spreadsheet',
+        char_margin, line_margin, word_margin)
    scaling_factor_x = pdf_x / float(img_x)
    scaling_factor_y = pdf_y / float(img_y)
    tables, v_segments, h_segments = morph_transform(imagename, s, invert)