Add new params
parent
b01edee337
commit
5c6a74fb2a
|
|
@ -277,9 +277,9 @@ class Stream:
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self.ncolumns = ncolumns
|
self.ncolumns = ncolumns
|
||||||
|
self.headers = headers
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.headers = headers
|
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,10 @@
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import glob
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
import zipfile
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -40,9 +42,12 @@ options:
|
||||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||||
if distance between words is greater than word
|
if distance between words is greater than word
|
||||||
margin. [default: 0.1]
|
margin. [default: 0.1]
|
||||||
-S, --print-stats List stats on the parsing process.
|
-J, --split_text Split text lines if they span across multiple cells.
|
||||||
-T, --save-stats Save stats to a file.
|
-K, --flag_size Flag substring if its size differs from the whole string.
|
||||||
-X, --plot <dist> Plot distributions. (page,all,rc)
|
Useful for super and subscripts.
|
||||||
|
-X, --print-stats List stats on the parsing process.
|
||||||
|
-Y, --save-stats Save stats to a file.
|
||||||
|
-Z, --plot <dist> Plot distributions. (page,all,rc)
|
||||||
|
|
||||||
camelot methods:
|
camelot methods:
|
||||||
lattice Looks for lines between data.
|
lattice Looks for lines between data.
|
||||||
|
|
@ -55,19 +60,22 @@ lattice_doc = """
|
||||||
Lattice method looks for lines between text to form a table.
|
Lattice method looks for lines between text to form a table.
|
||||||
|
|
||||||
usage:
|
usage:
|
||||||
camelot lattice [-t <tarea>...] [-F <fill>...]
|
camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
|
||||||
[-m <mtol>...] [options] [--] <file>
|
[-m <mtol>...] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
||||||
cells. Example: -F h, -F v, -F hv
|
cells. Example: -F h, -F v, -F hv
|
||||||
|
-H, --header <header> Specify header for each table.
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
-i, --invert Invert pdf image to make sure that lines are
|
-i, --invert Invert pdf image to make sure that lines are
|
||||||
in foreground.
|
in foreground.
|
||||||
|
-T, --shift_text <shift_text> Specify where the text in a spanning cell
|
||||||
|
should flow, order-sensitive. [default: lt]
|
||||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||||
(contour,line,joint,table) Example: -d table
|
(contour,line,joint,table) Example: -d table
|
||||||
"""
|
"""
|
||||||
|
|
@ -76,14 +84,15 @@ stream_doc = """
|
||||||
Stream method looks for whitespaces between text to form a table.
|
Stream method looks for whitespaces between text to form a table.
|
||||||
|
|
||||||
usage:
|
usage:
|
||||||
camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
|
camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-H <header>...]
|
||||||
[-m <mtol>...] [options] [--] <file>
|
[-y <ytol>...] [-m <mtol>...] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
||||||
Example: -c 10.1,20.2,30.3
|
Example: -c 10.1,20.2,30.3
|
||||||
-n, --ncols <ncols> Number of columns. [default: -1]
|
-n, --ncols <ncols> Number of columns. [default: -1]
|
||||||
|
-H, --header <header> Specify header for each table.
|
||||||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||||
together. [default: 2]
|
together. [default: 2]
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging columns
|
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||||
|
|
@ -266,6 +275,11 @@ def write_to_disk(data, f='csv', output=None, filename=None):
|
||||||
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
|
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
|
||||||
for row in data[page_number][table_number]['data']:
|
for row in data[page_number][table_number]['data']:
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
csv_glob = glob.glob(os.path.join(output, '*.csv'))
|
||||||
|
if len(csv_glob) > 1:
|
||||||
|
with zipfile.ZipFile(os.path.join(output, '{0}.zip'.format(froot)), 'w') as zfile:
|
||||||
|
for cfile in csv_glob:
|
||||||
|
zfile.write(cfile, os.path.basename(cfile), zipfile.ZIP_DEFLATED)
|
||||||
elif f == 'html':
|
elif f == 'html':
|
||||||
htmlname = '{0}.html'.format(froot)
|
htmlname = '{0}.html'.format(froot)
|
||||||
for page_number in sorted(data.keys()):
|
for page_number in sorted(data.keys()):
|
||||||
|
|
@ -339,11 +353,14 @@ if __name__ == '__main__':
|
||||||
try:
|
try:
|
||||||
tarea = args['--tarea'] if args['--tarea'] else None
|
tarea = args['--tarea'] if args['--tarea'] else None
|
||||||
fill = args['--fill'] if args['--fill'] else None
|
fill = args['--fill'] if args['--fill'] else None
|
||||||
|
header = args['--header'] if args['--header'] else None
|
||||||
mtol = [int(m) for m in args['--mtol']]
|
mtol = [int(m) for m in args['--mtol']]
|
||||||
manager = Pdf(Lattice(table_area=tarea, fill=fill,
|
shift_text = args['--shift_text'].split(',') if args['--shift_text'] else ['l', 't']
|
||||||
|
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
|
||||||
mtol=mtol, scale=int(args['--scale']),
|
mtol=mtol, scale=int(args['--scale']),
|
||||||
invert=args['--invert'], margins=margins,
|
invert=args['--invert'], margins=margins,
|
||||||
debug=args['--debug']),
|
split_text=args['--split_text'], flag_size=['--flag_size'],
|
||||||
|
shift_text=shift_text, debug=args['--debug']),
|
||||||
filename,
|
filename,
|
||||||
pagenos=p,
|
pagenos=p,
|
||||||
parallel=args['--parallel'],
|
parallel=args['--parallel'],
|
||||||
|
|
@ -408,11 +425,13 @@ if __name__ == '__main__':
|
||||||
ncolumns = [int(nc) for nc in args['--ncols']]
|
ncolumns = [int(nc) for nc in args['--ncols']]
|
||||||
else:
|
else:
|
||||||
ncolumns = None
|
ncolumns = None
|
||||||
|
header = args['--header'] if args['--header'] else None
|
||||||
ytol = [int(y) for y in args['--ytol']]
|
ytol = [int(y) for y in args['--ytol']]
|
||||||
mtol = [int(m) for m in args['--mtol']]
|
mtol = [int(m) for m in args['--mtol']]
|
||||||
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
||||||
ncolumns=ncolumns, ytol=ytol, mtol=mtol,
|
ncolumns=ncolumns, headers=header, ytol=ytol,
|
||||||
margins=margins, debug=args['--debug']),
|
mtol=mtol, margins=margins, split_text=args['--split_text'],
|
||||||
|
flag_size=['--flag_size'], debug=args['--debug']),
|
||||||
filename,
|
filename,
|
||||||
pagenos=p,
|
pagenos=p,
|
||||||
parallel=args['--parallel'],
|
parallel=args['--parallel'],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue