diff --git a/camelot/lattice.py b/camelot/lattice.py index f779d69..9f0b419 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -1,3 +1,4 @@ +from __future__ import print_function import os import cv2 @@ -160,7 +161,7 @@ class Lattice: """ def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2, - invert=False, debug=None): + invert=False, debug=None, verbose=False): self.pdfobject = pdfobject self.fill = fill @@ -169,6 +170,7 @@ class Lattice: self.mtol = mtol self.invert = invert self.debug = debug + self.verbose = verbose self.tables = {} if self.debug is not None: self.debug_images = {} @@ -184,6 +186,7 @@ class Lattice: Dictionary with page number as key and list of tables on that page as value. """ + vprint = print if self.verbose else lambda *a, **k: None self.pdfobject.split() self.pdfobject.convert() for page in self.pdfobject.extract(): @@ -273,7 +276,7 @@ class Lattice: ar = remove_empty(ar) ar = [list(o) for o in ar] page_tables.append(encode_list(ar)) - print pkey # verbose + vprint(pkey) self.tables[pkey] = page_tables if self.debug is not None: diff --git a/camelot/pdf.py b/camelot/pdf.py index eb2e08e..ce8783c 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -114,6 +114,8 @@ class Pdf: def split(self): """Splits pdf into single page pdfs. """ + if not self.pdfname.endswith('.pdf'): + raise TypeError("Only PDF format is supported.") infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) for p in self.pagenos: page = infile.getPage(p - 1) diff --git a/camelot/stream.py b/camelot/stream.py index 5139537..7bb09ae 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -1,3 +1,4 @@ +from __future__ import print_function import os import numpy as np @@ -105,13 +106,14 @@ class Stream: """ def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2, - debug=False): + debug=False, verbose=False): self.pdfobject = pdfobject self.ncolumns = ncolumns self.columns = columns self.ytol = ytol self.debug = debug + self.verbose = verbose self.tables = {} if self.debug: self.debug_text = {} @@ -125,6 +127,7 @@ class Stream: Dictionary with page number as key and list of tables on that page as value. """ + vprint = print if self.verbose else lambda *a, **k: None self.pdfobject.split() for page in self.pdfobject.extract(): p, __, text, __, __ = page @@ -172,7 +175,7 @@ class Stream: [ar[r_idx][c_idx], t.get_text().strip()]) else: ar[r_idx][c_idx] = t.get_text().strip() - print pkey # verbose + vprint(pkey) self.tables[pkey] = [encode_list(ar)] if self.pdfobject.clean: diff --git a/tools/camelot b/tools/camelot index 439112e..0d1d36a 100755 --- a/tools/camelot +++ b/tools/camelot @@ -1,4 +1,5 @@ #!/usr/bin/env python2 +from __future__ import print_function import os import sys import time @@ -25,6 +26,7 @@ options: Example: -p 1,3-6,10 [default: 1] -f, --format Output format. (csv,tsv,html,json,xlsx) [default: csv] -l, --log Print log to file. + -V, --verbose Verbose. -o, --output Output directory. camelot methods: @@ -128,7 +130,7 @@ def write_to_disk(data, f='csv', output=None, filename=None): [row for row in data[page][table]]}) save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data) except ImportError: - print "link to install docs" + print("link to install docs") if __name__ == '__main__': @@ -141,6 +143,7 @@ if __name__ == '__main__': elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) + vprint = print if args['--verbose'] else lambda *a, **k: None filename = args[''] filedir = os.path.dirname(args['']) logname, __ = os.path.splitext(filename) @@ -178,7 +181,8 @@ if __name__ == '__main__': jtol=int(args['--jtol']), mtol=int(args['--mtol']), invert=args['--invert'], - debug=args['--debug']) + debug=args['--debug'], + verbose=args['--verbose']) data = extractor.get_tables() if args['--debug']: extractor.plot_geometry(args['--debug']) @@ -195,7 +199,8 @@ if __name__ == '__main__': ncolumns=int(args['--ncols']), columns=args['--columns'], ytol=int(args['--ytol']), - debug=args['--debug']) + debug=args['--debug'], + verbose=args['--verbose']) data = extractor.get_tables() if args['--debug']: extractor.plot_text() @@ -204,11 +209,11 @@ if __name__ == '__main__': sys.exit() if data is None: - print "See 'camelot -h' for various parameters you can tweak." + print("See 'camelot -h' for various parameters you can tweak.") else: output = filedir if args['--output'] is None else args['--output'] write_to_disk(data, f=args['--format'], output=output, filename=filename) - print "finished in", time.time() - start_time, "seconds" + vprint("finished in", time.time() - start_time, "seconds") logging.info("Time taken: " + str(time.time() - start_time) + " seconds")