Add new params

pull/2/head
Vinayak Mehta 2016-10-18 18:23:35 +05:30 committed by GitHub
parent b01edee337
commit 5c6a74fb2a
2 changed files with 41 additions and 22 deletions

View File

@@ -277,9 +277,9 @@ class Stream:
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self.ncolumns = ncolumns self.ncolumns = ncolumns
self.headers = headers
self.ytol = ytol self.ytol = ytol
self.mtol = mtol self.mtol = mtol
self.headers = headers
self.char_margin, self.line_margin, self.word_margin = margins self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size

View File

@@ -2,8 +2,10 @@
from __future__ import print_function from __future__ import print_function
import os import os
import sys import sys
import glob
import time import time
import logging import logging
import zipfile
import warnings import warnings
import numpy as np import numpy as np
@@ -40,9 +42,12 @@ options:
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars -W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word if distance between words is greater than word
margin. [default: 0.1] margin. [default: 0.1]
-S, --print-stats List stats on the parsing process. -J, --split_text Split text lines if they span across multiple cells.
-T, --save-stats Save stats to a file. -K, --flag_size Flag substring if its size differs from the whole string.
-X, --plot <dist> Plot distributions. (page,all,rc) Useful for super and subscripts.
-X, --print-stats List stats on the parsing process.
-Y, --save-stats Save stats to a file.
-Z, --plot <dist> Plot distributions. (page,all,rc)
camelot methods: camelot methods:
lattice Looks for lines between data. lattice Looks for lines between data.
@@ -55,19 +60,22 @@ lattice_doc = """
Lattice method looks for lines between text to form a table. Lattice method looks for lines between text to form a table.
usage: usage:
camelot lattice [-t <tarea>...] [-F <fill>...] camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
[-m <mtol>...] [options] [--] <file> [-m <mtol>...] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-F, --fill <fill> Fill data in horizontal and/or vertical spanning -F, --fill <fill> Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv cells. Example: -F h, -F v, -F hv
-H, --header <header> Specify header for each table.
-m, --mtol <mtol> Tolerance to account for when merging lines -m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2] which are very close. [default: 2]
-s, --scale <scale> Scaling factor. Large scaling factor leads to -s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15] smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are -i, --invert Invert pdf image to make sure that lines are
in foreground. in foreground.
-T, --shift_text <shift_text> Specify where the text in a spanning cell
should flow, order-sensitive. [default: lt]
-d, --debug <debug> Debug by visualizing pdf geometry. -d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table (contour,line,joint,table) Example: -d table
""" """
@@ -76,14 +84,15 @@ stream_doc = """
Stream method looks for whitespaces between text to form a table. Stream method looks for whitespaces between text to form a table.
usage: usage:
camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...] camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-H <header>...]
[-m <mtol>...] [options] [--] <file> [-y <ytol>...] [-m <mtol>...] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates. -c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3 Example: -c 10.1,20.2,30.3
-n, --ncols <ncols> Number of columns. [default: -1] -n, --ncols <ncols> Number of columns. [default: -1]
-H, --header <header> Specify header for each table.
-y, --ytol <ytol> Tolerance to account for when grouping rows -y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2] together. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging columns -m, --mtol <mtol> Tolerance to account for when merging columns
@@ -266,6 +275,11 @@ def write_to_disk(data, f='csv', output=None, filename=None):
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL) outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
for row in data[page_number][table_number]['data']: for row in data[page_number][table_number]['data']:
writer.writerow(row) writer.writerow(row)
csv_glob = glob.glob(os.path.join(output, '*.csv'))
if len(csv_glob) > 1:
with zipfile.ZipFile(os.path.join(output, '{0}.zip'.format(froot)), 'w') as zfile:
for cfile in csv_glob:
zfile.write(cfile, os.path.basename(cfile), zipfile.ZIP_DEFLATED)
elif f == 'html': elif f == 'html':
htmlname = '{0}.html'.format(froot) htmlname = '{0}.html'.format(froot)
for page_number in sorted(data.keys()): for page_number in sorted(data.keys()):
@@ -339,11 +353,14 @@ if __name__ == '__main__':
try: try:
tarea = args['--tarea'] if args['--tarea'] else None tarea = args['--tarea'] if args['--tarea'] else None
fill = args['--fill'] if args['--fill'] else None fill = args['--fill'] if args['--fill'] else None
header = args['--header'] if args['--header'] else None
mtol = [int(m) for m in args['--mtol']] mtol = [int(m) for m in args['--mtol']]
manager = Pdf(Lattice(table_area=tarea, fill=fill, shift_text = args['--shift_text'].split(',') if args['--shift_text'] else ['l', 't']
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
mtol=mtol, scale=int(args['--scale']), mtol=mtol, scale=int(args['--scale']),
invert=args['--invert'], margins=margins, invert=args['--invert'], margins=margins,
debug=args['--debug']), split_text=args['--split_text'], flag_size=['--flag_size'],
shift_text=shift_text, debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
parallel=args['--parallel'], parallel=args['--parallel'],
@@ -408,11 +425,13 @@ if __name__ == '__main__':
ncolumns = [int(nc) for nc in args['--ncols']] ncolumns = [int(nc) for nc in args['--ncols']]
else: else:
ncolumns = None ncolumns = None
header = args['--header'] if args['--header'] else None
ytol = [int(y) for y in args['--ytol']] ytol = [int(y) for y in args['--ytol']]
mtol = [int(m) for m in args['--mtol']] mtol = [int(m) for m in args['--mtol']]
manager = Pdf(Stream(table_area=tarea, columns=columns, manager = Pdf(Stream(table_area=tarea, columns=columns,
ncolumns=ncolumns, ytol=ytol, mtol=mtol, ncolumns=ncolumns, headers=header, ytol=ytol,
margins=margins, debug=args['--debug']), mtol=mtol, margins=margins, split_text=args['--split_text'],
flag_size=['--flag_size'], debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
parallel=args['--parallel'], parallel=args['--parallel'],