Add verbose option to control progress output
parent
57917426e8
commit
13568865b5
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
|
|
@ -160,7 +161,7 @@ class Lattice:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
|
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
|
||||||
invert=False, debug=None):
|
invert=False, debug=None, verbose=False):
|
||||||
|
|
||||||
self.pdfobject = pdfobject
|
self.pdfobject = pdfobject
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
|
|
@ -169,6 +170,7 @@ class Lattice:
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
self.verbose = verbose
|
||||||
self.tables = {}
|
self.tables = {}
|
||||||
if self.debug is not None:
|
if self.debug is not None:
|
||||||
self.debug_images = {}
|
self.debug_images = {}
|
||||||
|
|
@ -184,6 +186,7 @@ class Lattice:
|
||||||
Dictionary with page number as key and list of tables on that
|
Dictionary with page number as key and list of tables on that
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
|
vprint = print if self.verbose else lambda *a, **k: None
|
||||||
self.pdfobject.split()
|
self.pdfobject.split()
|
||||||
self.pdfobject.convert()
|
self.pdfobject.convert()
|
||||||
for page in self.pdfobject.extract():
|
for page in self.pdfobject.extract():
|
||||||
|
|
@ -273,7 +276,7 @@ class Lattice:
|
||||||
ar = remove_empty(ar)
|
ar = remove_empty(ar)
|
||||||
ar = [list(o) for o in ar]
|
ar = [list(o) for o in ar]
|
||||||
page_tables.append(encode_list(ar))
|
page_tables.append(encode_list(ar))
|
||||||
print pkey # verbose
|
vprint(pkey)
|
||||||
self.tables[pkey] = page_tables
|
self.tables[pkey] = page_tables
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug is not None:
|
||||||
|
|
|
||||||
|
|
@ -114,6 +114,8 @@ class Pdf:
|
||||||
def split(self):
|
def split(self):
|
||||||
"""Splits pdf into single page pdfs.
|
"""Splits pdf into single page pdfs.
|
||||||
"""
|
"""
|
||||||
|
if not self.pdfname.endswith('.pdf'):
|
||||||
|
raise TypeError("Only PDF format is supported.")
|
||||||
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
||||||
for p in self.pagenos:
|
for p in self.pagenos:
|
||||||
page = infile.getPage(p - 1)
|
page = infile.getPage(p - 1)
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -105,13 +106,14 @@ class Stream:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
|
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
|
||||||
debug=False):
|
debug=False, verbose=False):
|
||||||
|
|
||||||
self.pdfobject = pdfobject
|
self.pdfobject = pdfobject
|
||||||
self.ncolumns = ncolumns
|
self.ncolumns = ncolumns
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
self.verbose = verbose
|
||||||
self.tables = {}
|
self.tables = {}
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_text = {}
|
self.debug_text = {}
|
||||||
|
|
@ -125,6 +127,7 @@ class Stream:
|
||||||
Dictionary with page number as key and list of tables on that
|
Dictionary with page number as key and list of tables on that
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
|
vprint = print if self.verbose else lambda *a, **k: None
|
||||||
self.pdfobject.split()
|
self.pdfobject.split()
|
||||||
for page in self.pdfobject.extract():
|
for page in self.pdfobject.extract():
|
||||||
p, __, text, __, __ = page
|
p, __, text, __, __ = page
|
||||||
|
|
@ -172,7 +175,7 @@ class Stream:
|
||||||
[ar[r_idx][c_idx], t.get_text().strip()])
|
[ar[r_idx][c_idx], t.get_text().strip()])
|
||||||
else:
|
else:
|
||||||
ar[r_idx][c_idx] = t.get_text().strip()
|
ar[r_idx][c_idx] = t.get_text().strip()
|
||||||
print pkey # verbose
|
vprint(pkey)
|
||||||
self.tables[pkey] = [encode_list(ar)]
|
self.tables[pkey] = [encode_list(ar)]
|
||||||
|
|
||||||
if self.pdfobject.clean:
|
if self.pdfobject.clean:
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
@ -25,6 +26,7 @@ options:
|
||||||
Example: -p 1,3-6,10 [default: 1]
|
Example: -p 1,3-6,10 [default: 1]
|
||||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||||
-l, --log Print log to file.
|
-l, --log Print log to file.
|
||||||
|
-V, --verbose Verbose.
|
||||||
-o, --output <directory> Output directory.
|
-o, --output <directory> Output directory.
|
||||||
|
|
||||||
camelot methods:
|
camelot methods:
|
||||||
|
|
@ -128,7 +130,7 @@ def write_to_disk(data, f='csv', output=None, filename=None):
|
||||||
[row for row in data[page][table]]})
|
[row for row in data[page][table]]})
|
||||||
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
|
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print "link to install docs"
|
print("link to install docs")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
@ -141,6 +143,7 @@ if __name__ == '__main__':
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
args.update(docopt(stream_doc, argv=argv))
|
args.update(docopt(stream_doc, argv=argv))
|
||||||
|
|
||||||
|
vprint = print if args['--verbose'] else lambda *a, **k: None
|
||||||
filename = args['<file>']
|
filename = args['<file>']
|
||||||
filedir = os.path.dirname(args['<file>'])
|
filedir = os.path.dirname(args['<file>'])
|
||||||
logname, __ = os.path.splitext(filename)
|
logname, __ = os.path.splitext(filename)
|
||||||
|
|
@ -178,7 +181,8 @@ if __name__ == '__main__':
|
||||||
jtol=int(args['--jtol']),
|
jtol=int(args['--jtol']),
|
||||||
mtol=int(args['--mtol']),
|
mtol=int(args['--mtol']),
|
||||||
invert=args['--invert'],
|
invert=args['--invert'],
|
||||||
debug=args['--debug'])
|
debug=args['--debug'],
|
||||||
|
verbose=args['--verbose'])
|
||||||
data = extractor.get_tables()
|
data = extractor.get_tables()
|
||||||
if args['--debug']:
|
if args['--debug']:
|
||||||
extractor.plot_geometry(args['--debug'])
|
extractor.plot_geometry(args['--debug'])
|
||||||
|
|
@ -195,7 +199,8 @@ if __name__ == '__main__':
|
||||||
ncolumns=int(args['--ncols']),
|
ncolumns=int(args['--ncols']),
|
||||||
columns=args['--columns'],
|
columns=args['--columns'],
|
||||||
ytol=int(args['--ytol']),
|
ytol=int(args['--ytol']),
|
||||||
debug=args['--debug'])
|
debug=args['--debug'],
|
||||||
|
verbose=args['--verbose'])
|
||||||
data = extractor.get_tables()
|
data = extractor.get_tables()
|
||||||
if args['--debug']:
|
if args['--debug']:
|
||||||
extractor.plot_text()
|
extractor.plot_text()
|
||||||
|
|
@ -204,11 +209,11 @@ if __name__ == '__main__':
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
if data is None:
|
if data is None:
|
||||||
print "See 'camelot <method> -h' for various parameters you can tweak."
|
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
||||||
else:
|
else:
|
||||||
output = filedir if args['--output'] is None else args['--output']
|
output = filedir if args['--output'] is None else args['--output']
|
||||||
write_to_disk(data, f=args['--format'],
|
write_to_disk(data, f=args['--format'],
|
||||||
output=output, filename=filename)
|
output=output, filename=filename)
|
||||||
|
|
||||||
print "finished in", time.time() - start_time, "seconds"
|
vprint("finished in", time.time() - start_time, "seconds")
|
||||||
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")
|
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue