Add support for pdfminer LAParams
parent
2ef3cc7651
commit
3045a92630
5
basic.py
5
basic.py
|
|
@ -33,9 +33,10 @@ def get_column_idx(t, columns):
|
|||
return c
|
||||
|
||||
|
||||
def basic(pdf_dir, filename):
|
||||
def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
|
||||
print "working on", filename
|
||||
text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
|
||||
text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
|
||||
char_margin, line_margin, word_margin)
|
||||
text.sort(key=lambda x: (-x.y0, x.x0))
|
||||
y_last = 0
|
||||
data = []
|
||||
|
|
|
|||
21
camelot.py
21
camelot.py
|
|
@ -36,20 +36,26 @@ parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
|
|||
help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
|
||||
parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
|
||||
help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
|
||||
parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
|
||||
parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
|
||||
help='Extract tables with ruling lines. (default: False)')
|
||||
parser.add_argument('-F', '--fill', action='store', dest='fill',
|
||||
parser.add_argument('-i', '--fill', action='store', dest='fill',
|
||||
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
|
||||
parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
|
||||
parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
|
||||
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
|
||||
parser.add_argument('-j', '--jtol', nargs='?', action='store',
|
||||
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
|
||||
parser.add_argument('-M', '--mtol', nargs='?', action='store',
|
||||
parser.add_argument('-t', '--mtol', nargs='?', action='store',
|
||||
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
|
||||
parser.add_argument('-i', '--invert', action='store_true', dest='invert',
|
||||
parser.add_argument('-n', '--invert', action='store_true', dest='invert',
|
||||
help='Make sure lines are in foreground. (default: False)')
|
||||
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
|
||||
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
|
||||
parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
|
||||
help='(default: 2.0)', default=2.0, type=float)
|
||||
parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
|
||||
help='(default: 0.5)', default=0.5, type=float)
|
||||
parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
|
||||
help='(default: 0.1)', default=0.1, type=float)
|
||||
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
|
||||
help='Specify output directory.')
|
||||
parser.add_argument('file', nargs=1)
|
||||
|
|
@ -98,14 +104,15 @@ if result.spreadsheet:
|
|||
g, '-depth', '8', g[:-4] + '.png']))
|
||||
try:
|
||||
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
|
||||
result.jtol, result.mtol, result.invert, result.debug)
|
||||
result.jtol, result.mtol, result.invert, result.debug,
|
||||
result.char_margin, result.line_margin, result.word_margin)
|
||||
except:
|
||||
logging.error("Couldn't parse " + g.split('/')[-1])
|
||||
print "Couldn't parse", g.split('/')[-1]
|
||||
else:
|
||||
print "using the basic method"
|
||||
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
|
||||
basic(pdf_dir, g.split('/')[-1])
|
||||
basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
|
||||
|
||||
if result.format == ['xlsx']:
|
||||
import csv
|
||||
|
|
|
|||
6
pdf.py
6
pdf.py
|
|
@ -37,13 +37,15 @@ def parse_text_spreadsheet(layout, t=None):
|
|||
return t
|
||||
|
||||
|
||||
def get_pdf_info(pdfname, method):
|
||||
def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
|
||||
with open(pdfname, 'r') as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
if not document.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed
|
||||
laparams = LAParams()
|
||||
laparams = LAParams(char_margin=char_margin,
|
||||
line_margin=line_margin,
|
||||
word_margin=word_margin)
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,8 @@ from utils import (translate, scale, merge_close_values, get_row_idx,
|
|||
get_column_idx, reduce_index, outline, fill, remove_empty)
|
||||
|
||||
|
||||
def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
|
||||
def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
||||
char_margin, line_margin, word_margin):
|
||||
if debug:
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
|
|
@ -20,7 +21,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
|
|||
img = cv2.imread(imagename)
|
||||
img_x, img_y = img.shape[1], img.shape[0]
|
||||
text, pdf_x, pdf_y = get_pdf_info(
|
||||
os.path.join(pdf_dir, filename), 'spreadsheet')
|
||||
os.path.join(pdf_dir, filename), 'spreadsheet',
|
||||
char_margin, line_margin, word_margin)
|
||||
scaling_factor_x = pdf_x / float(img_x)
|
||||
scaling_factor_y = pdf_y / float(img_y)
|
||||
tables, v_segments, h_segments = morph_transform(imagename, s, invert)
|
||||
|
|
|
|||
Loading…
Reference in New Issue