Add verbose

pull/2/head
Vinayak Mehta 2016-08-03 03:40:53 +05:30
parent 57917426e8
commit 13568865b5
4 changed files with 22 additions and 9 deletions

View File

@ -1,3 +1,4 @@
from __future__ import print_function
import os
import cv2
@ -160,7 +161,7 @@ class Lattice:
"""
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
invert=False, debug=None):
invert=False, debug=None, verbose=False):
self.pdfobject = pdfobject
self.fill = fill
@ -169,6 +170,7 @@ class Lattice:
self.mtol = mtol
self.invert = invert
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug is not None:
self.debug_images = {}
@ -184,6 +186,7 @@ class Lattice:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
self.pdfobject.convert()
for page in self.pdfobject.extract():
@ -273,7 +276,7 @@ class Lattice:
ar = remove_empty(ar)
ar = [list(o) for o in ar]
page_tables.append(encode_list(ar))
print pkey # verbose
vprint(pkey)
self.tables[pkey] = page_tables
if self.debug is not None:

View File

@ -114,6 +114,8 @@ class Pdf:
def split(self):
"""Splits pdf into single page pdfs.
"""
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported.")
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos:
page = infile.getPage(p - 1)

View File

@ -1,3 +1,4 @@
from __future__ import print_function
import os
import numpy as np
@ -105,13 +106,14 @@ class Stream:
"""
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
debug=False):
debug=False, verbose=False):
self.pdfobject = pdfobject
self.ncolumns = ncolumns
self.columns = columns
self.ytol = ytol
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug:
self.debug_text = {}
@ -125,6 +127,7 @@ class Stream:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
for page in self.pdfobject.extract():
p, __, text, __, __ = page
@ -172,7 +175,7 @@ class Stream:
[ar[r_idx][c_idx], t.get_text().strip()])
else:
ar[r_idx][c_idx] = t.get_text().strip()
print pkey # verbose
vprint(pkey)
self.tables[pkey] = [encode_list(ar)]
if self.pdfobject.clean:

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python2
from __future__ import print_function
import os
import sys
import time
@ -25,6 +26,7 @@ options:
Example: -p 1,3-6,10 [default: 1]
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Print log to file.
-V, --verbose Verbose.
-o, --output <directory> Output directory.
camelot methods:
@ -128,7 +130,7 @@ def write_to_disk(data, f='csv', output=None, filename=None):
[row for row in data[page][table]]})
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
except ImportError:
print "link to install docs"
print("link to install docs")
if __name__ == '__main__':
@ -141,6 +143,7 @@ if __name__ == '__main__':
elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv))
vprint = print if args['--verbose'] else lambda *a, **k: None
filename = args['<file>']
filedir = os.path.dirname(args['<file>'])
logname, __ = os.path.splitext(filename)
@ -178,7 +181,8 @@ if __name__ == '__main__':
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
invert=args['--invert'],
debug=args['--debug'])
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
if args['--debug']:
extractor.plot_geometry(args['--debug'])
@ -195,7 +199,8 @@ if __name__ == '__main__':
ncolumns=int(args['--ncols']),
columns=args['--columns'],
ytol=int(args['--ytol']),
debug=args['--debug'])
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
if args['--debug']:
extractor.plot_text()
@ -204,11 +209,11 @@ if __name__ == '__main__':
sys.exit()
if data is None:
print "See 'camelot <method> -h' for various parameters you can tweak."
print("See 'camelot <method> -h' for various parameters you can tweak.")
else:
output = filedir if args['--output'] is None else args['--output']
write_to_disk(data, f=args['--format'],
output=output, filename=filename)
print "finished in", time.time() - start_time, "seconds"
vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")