From 3045a926301d11bd246c43532cbb984c1ab33f24 Mon Sep 17 00:00:00 2001
From: Vinayak Mehta <vmehta94@gmail.com>
Date: Tue, 12 Jul 2016 17:44:08 +0530
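Subject: [PATCH] Add support for pdfminer LAParams

Expose pdfminer's char_margin, line_margin and word_margin layout
analysis parameters on the command line as -M/--char-margin,
-L/--line-margin and -W/--word-margin (defaults 2.0, 0.5 and 0.1).
The values are passed through basic() and spreadsheet() to
get_pdf_info(), which now builds LAParams from them instead of using
a bare LAParams().

Note that this also renames several existing short options; the long
option names are unchanged:

    --spreadsheet  -m  ->  -s
    --fill         -F  ->  -i
    --scale        -s  ->  -c
    --mtol         -M  ->  -t
    --invert       -i  ->  -n

-M previously meant --mtol and now means --char-margin, so existing
invocations that use the old short options must be updated. The
--fill help text still shows examples using the old -F spelling and
should be updated in a follow-up.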
---
 basic.py       |  5 +++--
 camelot.py     | 21 ++++++++++++++-------
 pdf.py         |  6 ++++--
 spreadsheet.py |  6 ++++--
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/basic.py b/basic.py
index e7ce1f6..e2ff777 100644
--- a/basic.py
+++ b/basic.py
@@ -33,9 +33,10 @@ def get_column_idx(t, columns):
             return c
 
 
-def basic(pdf_dir, filename):
+def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
     print "working on", filename
-    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
+    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
+                              char_margin, line_margin, word_margin)
     text.sort(key=lambda x: (-x.y0, x.x0))
     y_last = 0
     data = []
diff --git a/camelot.py b/camelot.py
index ec8fa96..7efd914 100644
--- a/camelot.py
+++ b/camelot.py
@@ -36,20 +36,26 @@ parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
                     help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
 parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
                     help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
+parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
                     help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-F', '--fill', action='store', dest='fill',
+parser.add_argument('-i', '--fill', action='store', dest='fill',
                     help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
+parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
                     help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
 parser.add_argument('-j', '--jtol', nargs='?', action='store',
                     dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-M', '--mtol', nargs='?', action='store',
+parser.add_argument('-t', '--mtol', nargs='?', action='store',
                     dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-i', '--invert', action='store_true', dest='invert',
+parser.add_argument('-n', '--invert', action='store_true', dest='invert',
                     help='Make sure lines are in foreground. (default: False)')
 parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
                     help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
+parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
+                    help='(default: 2.0)', default=2.0, type=float)
+parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
+                    help='(default: 0.5)', default=0.5, type=float)
+parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
+                    help='(default: 0.1)', default=0.1, type=float)
 parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
                     help='Specify output directory.')
 parser.add_argument('file', nargs=1)
@@ -98,14 +104,15 @@ if result.spreadsheet:
                             g, '-depth', '8', g[:-4] + '.png']))
         try:
             spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
-                        result.jtol, result.mtol, result.invert, result.debug)
+                        result.jtol, result.mtol, result.invert, result.debug,
+                        result.char_margin, result.line_margin, result.word_margin)
         except:
           logging.error("Couldn't parse " + g.split('/')[-1])
           print "Couldn't parse", g.split('/')[-1]
 else:
     print "using the basic method"
     for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        basic(pdf_dir, g.split('/')[-1])
+        basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
 
 if result.format == ['xlsx']:
     import csv
diff --git a/pdf.py b/pdf.py
index 3c603e0..136904c 100644
--- a/pdf.py
+++ b/pdf.py
@@ -37,13 +37,15 @@ def parse_text_spreadsheet(layout, t=None):
     return t
 
 
-def get_pdf_info(pdfname, method):
+def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
     with open(pdfname, 'r') as f:
         parser = PDFParser(f)
         document = PDFDocument(parser)
         if not document.is_extractable:
             raise PDFTextExtractionNotAllowed
-        laparams = LAParams()
+        laparams = LAParams(char_margin=char_margin,
+                            line_margin=line_margin,
+                            word_margin=word_margin)
         rsrcmgr = PDFResourceManager()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
diff --git a/spreadsheet.py b/spreadsheet.py
index 8d04a4c..46ea466 100644
--- a/spreadsheet.py
+++ b/spreadsheet.py
@@ -11,7 +11,8 @@ from utils import (translate, scale, merge_close_values, get_row_idx,
                    get_column_idx, reduce_index, outline, fill, remove_empty)
 
 
-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
+def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
+                char_margin, line_margin, word_margin):
     if debug:
         import matplotlib.pyplot as plt
         import matplotlib.patches as patches
@@ -20,7 +21,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
     img = cv2.imread(imagename)
     img_x, img_y = img.shape[1], img.shape[0]
     text, pdf_x, pdf_y = get_pdf_info(
-        os.path.join(pdf_dir, filename), 'spreadsheet')
+        os.path.join(pdf_dir, filename), 'spreadsheet',
+        char_margin, line_margin, word_margin)
     scaling_factor_x = pdf_x / float(img_x)
     scaling_factor_y = pdf_y / float(img_y)
     tables, v_segments, h_segments = morph_transform(imagename, s, invert)