Add support for pdfminer LAParams

pull/2/head
Vinayak Mehta 2016-07-12 17:44:08 +05:30
parent 2ef3cc7651
commit 3045a92630
4 changed files with 25 additions and 13 deletions

View File

@ -33,9 +33,10 @@ def get_column_idx(t, columns):
return c return c
def basic(pdf_dir, filename): def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
print "working on", filename print "working on", filename
text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
char_margin, line_margin, word_margin)
text.sort(key=lambda x: (-x.y0, x.x0)) text.sort(key=lambda x: (-x.y0, x.x0))
y_last = 0 y_last = 0
data = [] data = []

View File

@ -36,20 +36,26 @@ parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)') help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
parser.add_argument('-f', '--format', nargs=1, action='store', dest='format', parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"]) help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet', parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
help='Extract tables with ruling lines. (default: False)') help='Extract tables with ruling lines. (default: False)')
parser.add_argument('-F', '--fill', action='store', dest='fill', parser.add_argument('-i', '--fill', action='store', dest='fill',
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None) help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale', parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('-j', '--jtol', nargs='?', action='store', parser.add_argument('-j', '--jtol', nargs='?', action='store',
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int) dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
parser.add_argument('-M', '--mtol', nargs='?', action='store', parser.add_argument('-t', '--mtol', nargs='?', action='store',
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int) dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
parser.add_argument('-i', '--invert', action='store_true', dest='invert', parser.add_argument('-n', '--invert', action='store_true', dest='invert',
help='Make sure lines are in foreground. (default: False)') help='Make sure lines are in foreground. (default: False)')
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug', parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"') help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
help='(default: 2.0)', default=2.0, type=float)
parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
help='(default: 0.5)', default=0.5, type=float)
parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
help='(default: 0.1)', default=0.1, type=float)
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output', parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
help='Specify output directory.') help='Specify output directory.')
parser.add_argument('file', nargs=1) parser.add_argument('file', nargs=1)
@ -98,14 +104,15 @@ if result.spreadsheet:
g, '-depth', '8', g[:-4] + '.png'])) g, '-depth', '8', g[:-4] + '.png']))
try: try:
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale, spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
result.jtol, result.mtol, result.invert, result.debug) result.jtol, result.mtol, result.invert, result.debug,
result.char_margin, result.line_margin, result.word_margin)
except: except:
logging.error("Couldn't parse " + g.split('/')[-1]) logging.error("Couldn't parse " + g.split('/')[-1])
print "Couldn't parse", g.split('/')[-1] print "Couldn't parse", g.split('/')[-1]
else: else:
print "using the basic method" print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
basic(pdf_dir, g.split('/')[-1]) basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
if result.format == ['xlsx']: if result.format == ['xlsx']:
import csv import csv

6
pdf.py
View File

@ -37,13 +37,15 @@ def parse_text_spreadsheet(layout, t=None):
return t return t
def get_pdf_info(pdfname, method): def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
with open(pdfname, 'r') as f: with open(pdfname, 'r') as f:
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
if not document.is_extractable: if not document.is_extractable:
raise PDFTextExtractionNotAllowed raise PDFTextExtractionNotAllowed
laparams = LAParams() laparams = LAParams(char_margin=char_margin,
line_margin=line_margin,
word_margin=word_margin)
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams) device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)

View File

@ -11,7 +11,8 @@ from utils import (translate, scale, merge_close_values, get_row_idx,
get_column_idx, reduce_index, outline, fill, remove_empty) get_column_idx, reduce_index, outline, fill, remove_empty)
def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug): def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
char_margin, line_margin, word_margin):
if debug: if debug:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as patches import matplotlib.patches as patches
@ -20,7 +21,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
img = cv2.imread(imagename) img = cv2.imread(imagename)
img_x, img_y = img.shape[1], img.shape[0] img_x, img_y = img.shape[1], img.shape[0]
text, pdf_x, pdf_y = get_pdf_info( text, pdf_x, pdf_y = get_pdf_info(
os.path.join(pdf_dir, filename), 'spreadsheet') os.path.join(pdf_dir, filename), 'spreadsheet',
char_margin, line_margin, word_margin)
scaling_factor_x = pdf_x / float(img_x) scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y) scaling_factor_y = pdf_y / float(img_y)
tables, v_segments, h_segments = morph_transform(imagename, s, invert) tables, v_segments, h_segments = morph_transform(imagename, s, invert)