Add verbose

pull/2/head
Vinayak Mehta 2016-08-03 03:40:53 +05:30
parent 57917426e8
commit 13568865b5
4 changed files with 22 additions and 9 deletions

View File

@ -1,3 +1,4 @@
from __future__ import print_function
import os import os
import cv2 import cv2
@ -160,7 +161,7 @@ class Lattice:
""" """
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2, def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
invert=False, debug=None): invert=False, debug=None, verbose=False):
self.pdfobject = pdfobject self.pdfobject = pdfobject
self.fill = fill self.fill = fill
@ -169,6 +170,7 @@ class Lattice:
self.mtol = mtol self.mtol = mtol
self.invert = invert self.invert = invert
self.debug = debug self.debug = debug
self.verbose = verbose
self.tables = {} self.tables = {}
if self.debug is not None: if self.debug is not None:
self.debug_images = {} self.debug_images = {}
@ -184,6 +186,7 @@ class Lattice:
Dictionary with page number as key and list of tables on that Dictionary with page number as key and list of tables on that
page as value. page as value.
""" """
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split() self.pdfobject.split()
self.pdfobject.convert() self.pdfobject.convert()
for page in self.pdfobject.extract(): for page in self.pdfobject.extract():
@ -273,7 +276,7 @@ class Lattice:
ar = remove_empty(ar) ar = remove_empty(ar)
ar = [list(o) for o in ar] ar = [list(o) for o in ar]
page_tables.append(encode_list(ar)) page_tables.append(encode_list(ar))
print pkey # verbose vprint(pkey)
self.tables[pkey] = page_tables self.tables[pkey] = page_tables
if self.debug is not None: if self.debug is not None:

View File

@ -114,6 +114,8 @@ class Pdf:
def split(self): def split(self):
"""Splits pdf into single page pdfs. """Splits pdf into single page pdfs.
""" """
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported.")
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos: for p in self.pagenos:
page = infile.getPage(p - 1) page = infile.getPage(p - 1)

View File

@ -1,3 +1,4 @@
from __future__ import print_function
import os import os
import numpy as np import numpy as np
@ -105,13 +106,14 @@ class Stream:
""" """
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2, def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
debug=False): debug=False, verbose=False):
self.pdfobject = pdfobject self.pdfobject = pdfobject
self.ncolumns = ncolumns self.ncolumns = ncolumns
self.columns = columns self.columns = columns
self.ytol = ytol self.ytol = ytol
self.debug = debug self.debug = debug
self.verbose = verbose
self.tables = {} self.tables = {}
if self.debug: if self.debug:
self.debug_text = {} self.debug_text = {}
@ -125,6 +127,7 @@ class Stream:
Dictionary with page number as key and list of tables on that Dictionary with page number as key and list of tables on that
page as value. page as value.
""" """
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split() self.pdfobject.split()
for page in self.pdfobject.extract(): for page in self.pdfobject.extract():
p, __, text, __, __ = page p, __, text, __, __ = page
@ -172,7 +175,7 @@ class Stream:
[ar[r_idx][c_idx], t.get_text().strip()]) [ar[r_idx][c_idx], t.get_text().strip()])
else: else:
ar[r_idx][c_idx] = t.get_text().strip() ar[r_idx][c_idx] = t.get_text().strip()
print pkey # verbose vprint(pkey)
self.tables[pkey] = [encode_list(ar)] self.tables[pkey] = [encode_list(ar)]
if self.pdfobject.clean: if self.pdfobject.clean:

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import print_function
import os import os
import sys import sys
import time import time
@ -25,6 +26,7 @@ options:
Example: -p 1,3-6,10 [default: 1] Example: -p 1,3-6,10 [default: 1]
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv] -f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Print log to file. -l, --log Print log to file.
-V, --verbose Verbose.
-o, --output <directory> Output directory. -o, --output <directory> Output directory.
camelot methods: camelot methods:
@ -128,7 +130,7 @@ def write_to_disk(data, f='csv', output=None, filename=None):
[row for row in data[page][table]]}) [row for row in data[page][table]]})
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data) save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
except ImportError: except ImportError:
print "link to install docs" print("link to install docs")
if __name__ == '__main__': if __name__ == '__main__':
@ -141,6 +143,7 @@ if __name__ == '__main__':
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv)) args.update(docopt(stream_doc, argv=argv))
vprint = print if args['--verbose'] else lambda *a, **k: None
filename = args['<file>'] filename = args['<file>']
filedir = os.path.dirname(args['<file>']) filedir = os.path.dirname(args['<file>'])
logname, __ = os.path.splitext(filename) logname, __ = os.path.splitext(filename)
@ -178,7 +181,8 @@ if __name__ == '__main__':
jtol=int(args['--jtol']), jtol=int(args['--jtol']),
mtol=int(args['--mtol']), mtol=int(args['--mtol']),
invert=args['--invert'], invert=args['--invert'],
debug=args['--debug']) debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables() data = extractor.get_tables()
if args['--debug']: if args['--debug']:
extractor.plot_geometry(args['--debug']) extractor.plot_geometry(args['--debug'])
@ -195,7 +199,8 @@ if __name__ == '__main__':
ncolumns=int(args['--ncols']), ncolumns=int(args['--ncols']),
columns=args['--columns'], columns=args['--columns'],
ytol=int(args['--ytol']), ytol=int(args['--ytol']),
debug=args['--debug']) debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables() data = extractor.get_tables()
if args['--debug']: if args['--debug']:
extractor.plot_text() extractor.plot_text()
@ -204,11 +209,11 @@ if __name__ == '__main__':
sys.exit() sys.exit()
if data is None: if data is None:
print "See 'camelot <method> -h' for various parameters you can tweak." print("See 'camelot <method> -h' for various parameters you can tweak.")
else: else:
output = filedir if args['--output'] is None else args['--output'] output = filedir if args['--output'] is None else args['--output']
write_to_disk(data, f=args['--format'], write_to_disk(data, f=args['--format'],
output=output, filename=filename) output=output, filename=filename)
print "finished in", time.time() - start_time, "seconds" vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds") logging.info("Time taken: " + str(time.time() - start_time) + " seconds")