Modify command line tool

- Precompute globs
- Replace argparse with docopt
- Fix CLI
- Update .gitignore
- Add docstrings
- Update README
- Fix typo
- Replace zip subprocess call
- Use tempfile
- Fix newline

Branch: pull/2/head
Parent: 3045a92630
Commit: 271d4cafd6
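Editorial note, not part of the commit: the central change below is swapping a single argparse parser for docopt, with one usage string per method (`doc`, `lattice_doc`, `stream_doc` in camelot.py). Here is a minimal, hypothetical sketch of that dispatch pattern; the trimmed usage strings and option subset are illustrative stand-ins for the full strings added in the diff.

```python
"""Minimal docopt sub-command dispatch sketch (illustrative only)."""
from docopt import docopt

doc = """usage: camelot.py [options] <method> [<args>...]

options:
  -p, --pages <pageno>  Comma-separated list of page numbers. [default: 1]
"""

lattice_doc = """usage: camelot.py lattice [options] [--] <file>

options:
  -s, --scale <scale>  Scaling factor. [default: 15]
"""

if __name__ == '__main__':
    # Parse global options first; the method's own arguments are left alone.
    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        # Re-parse the remaining argv against the method-specific usage
        # string and merge the result into the global argument dict.
        args.update(docopt(lattice_doc, argv=argv))
    print(args)
```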
.gitignore
@@ -1,3 +1,5 @@
 __pycache__/
 *.py[cod]
+*.so
+
 .camelot/
README.md | 69 changed lines
@@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p

 ## Usage

-    python2 camelot.py [options] file
-
-    positional arguments:
-      file
-
-    optional arguments:
-      -h, --help            show this help message and exit
-      -p, --pages PAGES [PAGES ...]
-                            Specify the page numbers and/or page ranges to be
-                            parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-      -f, --format FORMAT   Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-      -m, --spreadsheet     Extract tables with ruling lines. (default: False)
-      -F, --fill FILL       Fill the values in empty cells horizontally(h) and/or
-                            vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-      -s, --scale [SCALE]   Scaling factor. Large scaling factor leads to smaller
-                            lines being detected. (default: 15)
-      -j, --jtol [JTOL]     Tolerance to account for when comparing joint and line
-                            coordinates. (default: 2)
-      -M, --mtol [MTOL]     Tolerance to account for when merging lines which are
-                            very close. (default: 2)
-      -i, --invert          Make sure lines are in foreground. (default: False)
-      -d, --debug DEBUG     Debug by visualizing contours, lines, joints, tables.
-                            Example: --debug="contours"
-      -o, --output OUTPUT   Specify output directory.
+<pre>
+camelot.py [options] <method> [<args>...]
+
+options:
+ -h, --help                 Show this screen.
+ -v, --version              Show version.
+ -p, --pages <pageno>       Comma-separated list of page numbers.
+                            Example: -p 1,3-6,10 [default: 1]
+ -f, --format <format>      Output format. (csv,xlsx) [default: csv]
+ -l, --log                  Print log to file.
+ -o, --output <directory>   Output directory.
+
+camelot methods:
+ lattice   Looks for lines between data.
+ stream    Looks for spaces between data.
+
+See 'camelot <method> -h' for more information on a specific method.
+</pre>

 ## Development
basic.py | 80 deleted lines
@@ -1,80 +0,0 @@
-import os
-import csv
-import numpy as np
-
-from pdf import get_pdf_info
-
-
-def overlap(l):
-    merged = []
-    for higher in l:
-        if not merged:
-            merged.append(higher)
-        else:
-            lower = merged[-1]
-            if higher[0] <= lower[1]:
-                upper_bound = max(lower[1], higher[1])
-                lower_bound = min(lower[0], higher[0])
-                merged[-1] = (lower_bound, upper_bound)
-            else:
-                merged.append(higher)
-    return merged
-
-
-def get_row_idx(t, rows):
-    for r in range(len(rows)):
-        if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
-            return r
-
-
-def get_column_idx(t, columns):
-    for c in range(len(columns)):
-        if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
-            return c
-
-
-def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
-    print "working on", filename
-    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
-                              char_margin, line_margin, word_margin)
-    text.sort(key=lambda x: (-x.y0, x.x0))
-    y_last = 0
-    data = []
-    temp = []
-    elements = []
-    for t in text:
-        # is checking for upright necessary?
-        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-        # type(obj) is LTChar]):
-        if t.get_text().strip():
-            if not np.isclose(y_last, t.y0, atol=2):
-                y_last = t.y0
-                elements.append(len(temp))
-                data.append(temp)
-                temp = []
-            temp.append(t)
-    # a table can't have just 1 column, can it?
-    elements = filter(lambda x: x != 1, elements)
-    # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
-    mode = max(set(elements), key=elements.count)
-    columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
-    columns = overlap(sorted(columns))
-    columns = [(c[0] + c[1]) / 2.0 for c in columns]
-
-    output = [['' for c in columns] for d in data]
-    for row, d in enumerate(data):
-        for t in d:
-            cog = (t.x0 + t.x1) / 2.0
-            diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
-            idx = min(diff, key=lambda x: x[1])
-            if output[row][idx[0]]:
-                output[row][idx[0]] += ' ' + t.get_text().strip()
-            else:
-                output[row][idx[0]] = t.get_text().strip()
-
-    csvname = filename.split('.')[0] + '.csv'
-    csvpath = os.path.join(pdf_dir, csvname)
-    with open(csvpath, 'w') as outfile:
-        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-        for row in output:
-            writer.writerow([cell.encode('utf-8') for cell in row])
camelot.py
@@ -1,136 +1,258 @@
+#!/usr/bin/env python2
 import os
 import re
+import csv
+import sys
 import glob
 import time
 import shutil
 import logging
+import zipfile
+import tempfile
 import subprocess
-import argparse
-
-from basic import basic
-from spreadsheet import spreadsheet
+from docopt import docopt
+from werkzeug.utils import secure_filename
+
+from lattice import lattice
+from stream import stream
+
+
+doc = """
+camelot parses tables from PDFs!
+
+usage:
+ camelot.py [options] <method> [<args>...]
+
+options:
+ -h, --help                 Show this screen.
+ -v, --version              Show version.
+ -p, --pages <pageno>       Comma-separated list of page numbers.
+                            Example: -p 1,3-6,10 [default: 1]
+ -f, --format <format>      Output format. (csv,xlsx) [default: csv]
+ -l, --log                  Print log to file.
+ -o, --output <directory>   Output directory.
+
+camelot methods:
+ lattice   Looks for lines between data.
+ stream    Looks for spaces between data.
+
+See 'camelot <method> -h' for more information on a specific method.
+"""
+
+lattice_doc = """
+Lattice method looks for lines between data to form a table.
+
+usage:
+ camelot.py lattice [options] [--] <file>
+
+options:
+ -F, --fill <fill>     Fill data in horizontal and/or vertical spanning
+                       cells. Example: -F h, -F v, -F hv
+ -s, --scale <scale>   Scaling factor. Large scaling factor leads to
+                       smaller lines being detected. [default: 15]
+ -j, --jtol <jtol>     Tolerance to account for when comparing joint
+                       and line coordinates. [default: 2]
+ -m, --mtol <mtol>     Tolerance to account for when merging lines
+                       which are very close. [default: 2]
+ -i, --invert          Invert pdf image to make sure that lines are
+                       in foreground.
+ -d, --debug <debug>   Debug by visualizing pdf geometry.
+                       (contour,line,joint,table) Example: -d table
+"""
+
+stream_doc = """
+Stream method looks for spaces between data to form a table.
+
+usage:
+ camelot.py stream [options] [--] <file>
+
+options:
+ -n, --ncols <ncols>        Number of columns. [default: 0]
+ -c, --columns <columns>    Comma-separated list of column x-coordinates.
+                            Example: -c 10.1,20.2,30.3
+ -M, --cmargin <cmargin>    Char margin. Chars closer than cmargin are
+                            grouped together to form a word. [default: 2.0]
+ -L, --lmargin <lmargin>    Line margin. Lines closer than lmargin are
+                            grouped together to form a textbox. [default: 0.5]
+ -W, --wmargin <wmargin>    Word margin. Insert blank spaces between chars
+                            if distance between words is greater than word
+                            margin. [default: 0.1]
+ -d, --debug                Debug by visualizing textboxes.
+"""

 pno = re.compile(r'\d+')


-def mkdir(directory):
-    if not os.path.isdir(directory):
-        os.makedirs(directory)
-
-
-def filesort(filename):
-    filename = filename.split('/')[-1]
+def filesort(filepath):
+    filename = os.path.basename(filepath)
     num = pno.findall(filename)
     if len(num) == 2:
         return (int(num[0]), int(num[1]))
     else:
         return (int(num[0]), 0)

-start_time = time.time()
-CAMELOT_DIR = '.camelot/'
-mkdir(CAMELOT_DIR)
-
-parser = argparse.ArgumentParser(
-    description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
-parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
-    help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
-parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
-    help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
-    help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-i', '--fill', action='store', dest='fill',
-    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
-    help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
-parser.add_argument('-j', '--jtol', nargs='?', action='store',
-    dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-t', '--mtol', nargs='?', action='store',
-    dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-n', '--invert', action='store_true', dest='invert',
-    help='Make sure lines are in foreground. (default: False)')
-parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
-    help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
-parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
-    help='(default: 2.0)', default=2.0, type=float)
-parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
-    help='(default: 0.5)', default=0.5, type=float)
-parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
-    help='(default: 0.1)', default=0.1, type=float)
-parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
-    help='Specify output directory.')
-parser.add_argument('file', nargs=1)
-
-result = parser.parse_args()
-
-if result.pages:
-    if result.pages == ['all']:
-        p = result.pages
-    else:
-        p = []
-        for r in result.pages[0].split(' '):
-            if '-' in r:
-                a, b = r.split('-')
-                a, b = int(a), int(b)
-                p.extend([str(i) for i in range(a, b + 1)])
-            else:
-                p.extend([str(r)])
-else:
-    p = ['1']
-p = sorted(set(p))
-
-filename = result.file[0].split('/')[-1]
-# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
-pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
-mkdir(pdf_dir)
-logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
-    0] + '.log'), filemode='w', level=logging.DEBUG)
-
-shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
-print "separating pdf into pages"
-print
-if p == ['all']:
-    subprocess.call(['pdfseparate', os.path.join(
-        pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
-else:
-    for page in p:
-        subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
-            pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
-
-if result.spreadsheet:
-    print "using the spreadsheet method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        print "converting", g.split('/')[-1], "to image"
-        os.system(' '.join(['convert', '-density', '300',
-                            g, '-depth', '8', g[:-4] + '.png']))
-        try:
-            spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
-                        result.jtol, result.mtol, result.invert, result.debug,
-                        result.char_margin, result.line_margin, result.word_margin)
-        except:
-            logging.error("Couldn't parse " + g.split('/')[-1])
-            print "Couldn't parse", g.split('/')[-1]
-else:
-    print "using the basic method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
-
-if result.format == ['xlsx']:
-    import csv
-    from pyexcel_xlsx import save_data
-    from collections import OrderedDict
-    data = OrderedDict()
-    for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
-        print "adding", c.split('/')[-1], "to excel file"
-        with open(c, 'r') as csvfile:
-            reader = csv.reader(csvfile)
-            data.update({c.split('/')[-1].split('.')
-                         [0]: [row for row in reader]})
-    xlsxname = filename.split('.')[0] + '.xlsx'
-    xlsxpath = os.path.join(pdf_dir, xlsxname)
-    save_data(xlsxpath, data)
-    print
-    print "saved as", xlsxname
-
-print "finished in", time.time() - start_time, "seconds"
-logging.info("Time taken for " + filename + ": " +
-             str(time.time() - start_time) + " seconds")
+
+if __name__ == '__main__':
+    start_time = time.time()
+    tmpdir = tempfile.mkdtemp()
+
+    args = docopt(doc, version='0.1', options_first=True)
+    argv = [args['<method>']] + args['<args>']
+    if args['<method>'] == 'lattice':
+        args.update(docopt(lattice_doc, argv=argv))
+    elif args['<method>'] == 'stream':
+        args.update(docopt(stream_doc, argv=argv))
+
+    if args['--pages']:
+        if args['--pages'] == ['all']:
+            p = args['--pages']
+        else:
+            p = []
+            for r in args['--pages'].split(','):
+                if '-' in r:
+                    a, b = r.split('-')
+                    a, b = int(a), int(b)
+                    p.extend([str(i) for i in range(a, b + 1)])
+                else:
+                    p.extend([str(r)])
+    else:
+        p = ['1']
+    p = sorted(set(p))
+
+    fname = os.path.basename(args['<file>'])
+    fname = secure_filename(fname)
+    fdir = os.path.dirname(args['<file>'])
+    froot, fext = os.path.splitext(fname)
+    if fext.lower() != '.pdf':
+        print "camelot can parse only pdfs right now"
+        sys.exit()
+
+    logfname = os.path.join(tmpdir, froot + '.log')
+    logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)
+
+    shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
+    print "separating pdf into pages"
+    print
+    if p == ['all']:
+        subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
+                        'pg-%d.pdf')])
+    else:
+        for page in p:
+            subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
+                            os.path.join(tmpdir, 'pg-%s.pdf' % page)])
+
+    glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
+    if args['<method>'] == 'lattice':
+        print "using the lattice method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            print "converting %s to image" % g_fname
+            os.system(' '.join(['convert', '-density', '300',
+                                g, '-depth', '8', g_froot + '.png']))
+            try:
+                data = lattice(g, f=args['--fill'], s=int(args['--scale']),
+                               jtol=int(args['--jtol']), mtol=int(args['--mtol']),
+                               invert=args['--invert'], debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot lattice -h' for various parameters you can tweak."
+                    sys.exit()
+                for k in sorted(data.keys()):
+                    csvfile = g_froot + '_%s.csv' % k
+                    with open(csvfile, 'w') as outfile:
+                        writer = csv.writer(outfile)
+                        for d in data[k]:
+                            writer.writerow([c.encode('utf-8') for c in d])
+                    print "saved as", os.path.basename(csvfile)
+                    print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+    elif args['<method>'] == 'stream':
+        print "using the stream method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            try:
+                data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
+                              char_margin=float(args['--cmargin']),
+                              line_margin=float(args['--lmargin']),
+                              word_margin=float(args['--wmargin']),
+                              debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot stream -h' for various parameters you can tweak."
+                    sys.exit()
+                csvfile = g_froot + '.csv'
+                with open(csvfile, 'w') as outfile:
+                    writer = csv.writer(outfile)
+                    for d in data:
+                        writer.writerow([c.encode('utf-8') for c in d])
+                print "saved as", os.path.basename(csvfile)
+                print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+
+    glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
+    if args['--format'] == 'csv':
+        if len(glob_csv) == 1:
+            if args['--output']:
+                shutil.copy(glob_csv[0], args['--output'])
+                if args['--log']:
+                    shutil.copy(logfname, args['--output'])
+            else:
+                shutil.copy(glob_csv[0], fdir)
+                if args['--log']:
+                    shutil.copy(zippath, fdir)
+        else:
+            zipname = froot + '.zip'
+            zippath = os.path.join(tmpdir, zipname)
+            print "zipping 'em up"
+            with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
+                for g in glob_csv:
+                    myzip.write(g, os.path.join(froot, os.path.basename(g)))
+            if args['--output']:
+                shutil.copy(zippath, args['--output'])
+                if args['--log']:
+                    shutil.copy(logfname, args['--output'])
+            else:
+                shutil.copy(zippath, fdir)
+                if args['--log']:
+                    shutil.copy(zippath, fdir)
+        print
+    elif args['--format'] == 'xlsx':
+        from pyexcel_xlsx import save_data
+        from collections import OrderedDict
+        data = OrderedDict()
+        for c in glob_csv:
+            c_fname = os.path.basename(c)
+            c_froot, __ = os.path.splitext(c)
+            print "adding", c_fname, "to excel file"
+            with open(c, 'r') as csvfile:
+                reader = csv.reader(csvfile)
+                c_froot, __ = os.path.splitext(c_fname)
+                data.update({c_froot: [row for row in reader]})
+        xlsxname = froot + '.xlsx'
+        xlsxpath = os.path.join(tmpdir, xlsxname)
+        save_data(xlsxpath, data)
+        print
+        print "saved as", xlsxname
+
+    print "cleaning up..."
+    shutil.rmtree(tmpdir)
+
+    print "finished in", time.time() - start_time, "seconds"
+    logging.info("Time taken for " + fname + ": " +
+                 str(time.time() - start_time) + " seconds")
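Editorial note, not part of the commit: the `--pages` handling in the new `__main__` block expands a spec such as `1,3-6,10` into a de-duplicated list of page strings before calling `pdfseparate`. A self-contained restatement of that expansion, for reference (the `'all'` short-circuit reflects the intent of the check above; page strings sort lexicographically, as in the diff):

```python
def expand_pages(spec):
    # Expand a comma-separated page spec ('1,3-6,10' or 'all') the same way
    # camelot.py does before invoking pdfseparate per page.
    if spec == 'all':
        return ['all']
    pages = []
    for r in spec.split(','):
        if '-' in r:
            a, b = r.split('-')
            pages.extend(str(i) for i in range(int(a), int(b) + 1))
        else:
            pages.append(str(r))
    return sorted(set(pages))

# expand_pages('1,3-6,10') -> ['1', '10', '3', '4', '5', '6']
```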
cell.py | 58 changed lines
@@ -1,6 +1,44 @@
 class Cell:
+    """Cell
+
+    Parameters
+    ----------
+    x1 : int
+
+    y1 : int
+
+    x2 : int
+
+    y2 : int
+
+    Attributes
+    ----------
+    lb : tuple
+
+    lt : tuple
+
+    rb : tuple
+
+    rt : tuple
+
+    bbox : tuple
+
+    left : bool
+
+    right : bool
+
+    top : bool
+
+    bottom : bool
+
+    text : string
+
+    spanning_h : bool
+
+    spanning_v : bool
+    """
     def __init__(self, x1, y1, x2, y2):
         self.lb = (x1, y1)
         self.lt = (x1, y2)
         self.rb = (x2, y1)
@@ -15,10 +53,28 @@ class Cell:
         self.spanning_v = False

     def add_text(self, text):
-        self.text += text
+        """Add text to cell object.
+
+        Parameters
+        ----------
+        text : string
+        """
+        self.text = ''.join([self.text, text])

     def get_text(self):
+        """Get text from cell object.
+
+        Returns
+        -------
+        text : string
+        """
         return self.text

     def get_bounded_edges(self):
+        """Get number of edges by which a cell is bounded.
+
+        Returns
+        -------
+        bounded_edges : int
+        """
         return self.top + self.bottom + self.left + self.right
lattice.py (previously spreadsheet.py)
@@ -1,37 +1,173 @@
 import os
-import csv
 import cv2
 import glob
 import numpy as np

 from table import Table
 from pdf import get_pdf_info
-from morph_transform import morph_transform
 from utils import (translate, scale, merge_close_values, get_row_idx,
                    get_column_idx, reduce_index, outline, fill, remove_empty)


-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
-                char_margin, line_margin, word_margin):
+def morph_transform(img, s=15, invert=False):
+    """Morphological Transformation
+
+    Applies a series of morphological operations on the image
+    to find table contours and line segments.
+    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
+
+    Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
+    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
+
+    Parameters
+    ----------
+    img : ndarray
+
+    s : int, default: 15, optional
+        Scaling factor. Large scaling factor leads to smaller lines
+        being detected.
+
+    invert : bool, default: False, optional
+        Invert pdf image to make sure that lines are in foreground.
+
+    Returns
+    -------
+    tables : dict
+        Dictionary with table bounding box as key and list of
+        joints found in the table as value.
+
+    v_segments : list
+        List of vertical line segments found in the image.
+
+    h_segments : list
+        List of horizontal line segments found in the image.
+    """
+    img_x, img_y = img.shape[1], img.shape[0]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    if invert:
+        threshold = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    else:
+        threshold = cv2.adaptiveThreshold(np.invert(
+            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    vertical = threshold
+    horizontal = threshold
+
+    scale = s
+    verticalsize = vertical.shape[0] / scale
+    horizontalsize = horizontal.shape[1] / scale
+
+    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
+    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
+
+    vertical = cv2.erode(vertical, ver, (-1, -1))
+    vertical = cv2.dilate(vertical, ver, (-1, -1))
+
+    horizontal = cv2.erode(horizontal, hor, (-1, -1))
+    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
+
+    mask = vertical + horizontal
+    joints = np.bitwise_and(vertical, horizontal)
+    __, contours, __ = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+    tables = {}
+    for c in contours:
+        c_poly = cv2.approxPolyDP(c, 3, True)
+        x, y, w, h = cv2.boundingRect(c_poly)
+        # find number of non-zero values in joints using what boundingRect
+        # returns
+        roi = joints[y : y + h, x : x + w]
+        __, jc, __ = cv2.findContours(
+            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        if len(jc) <= 4:  # remove contours with less than <=4 joints
+            continue
+        joint_coords = []
+        for j in jc:
+            jx, jy, jw, jh = cv2.boundingRect(j)
+            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+            joint_coords.append((c1, c2))
+        tables[(x, y + h, x + w, y)] = joint_coords
+
+    v_segments, h_segments = [], []
+    _, vcontours, _ = cv2.findContours(
+        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for vc in vcontours:
+        x, y, w, h = cv2.boundingRect(vc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
+
+    _, hcontours, _ = cv2.findContours(
+        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for hc in hcontours:
+        x, y, w, h = cv2.boundingRect(hc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+    return tables, v_segments, h_segments
+
+
+def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
+    """Lattice algorithm
+
+    Makes table using pdf geometry information returned by
+    morph_transform and fills data returned by PDFMiner in table cells.
+
+    Parameters
+    ----------
+    filepath : string
+
+    f : string, default: None, optional
+        Fill data in horizontal and/or vertical spanning
+        cells. ('h', 'v', 'hv')
+
+    s : int, default: 15, optional
+        Scaling factor. Large scaling factor leads to smaller lines
+        being detected.
+
+    jtol : int, default: 2, optional
+        Tolerance to account for when comparing joint and line
+        coordinates.
+
+    mtol : int, default: 2, optional
+        Tolerance to account for when merging lines which are
+        very close.
+
+    invert : bool, default: False, optional
+        Invert pdf image to make sure that lines are in foreground.
+
+    debug : string
+        Debug by visualizing pdf geometry.
+        ('contour', 'line', 'joint', 'table')
+    Returns
+    -------
+    output : dict
+        Dictionary with table number as key and list of data as value.
+    """
     if debug:
         import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
+    filename = os.path.basename(filepath)
     print "working on", filename
-    imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
+    fileroot, __ = os.path.splitext(filepath)
+    imagename = fileroot + '.png'
     img = cv2.imread(imagename)
     img_x, img_y = img.shape[1], img.shape[0]
-    text, pdf_x, pdf_y = get_pdf_info(
-        os.path.join(pdf_dir, filename), 'spreadsheet',
-        char_margin, line_margin, word_margin)
+    text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
     scaling_factor_x = pdf_x / float(img_x)
     scaling_factor_y = pdf_y / float(img_y)
-    tables, v_segments, h_segments = morph_transform(imagename, s, invert)
+    tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)

-    if debug == ["contours"]:
+    if debug == "contour":
         for t in tables.keys():
             cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
         plt.imshow(img)
-    if debug == ["joints"]:
+        plt.show()
+        return None
+    if debug == "joint":
         x_coord = []
         y_coord = []
         for k in tables.keys():
@@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
         plt.plot(x_coord, y_coord, 'ro')
         plt.axis([0, max_x + 100, max_y + 100, 0])
         plt.imshow(img)
+        plt.show()
+        return None

     # detect if vertical
     num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
@@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                    abs(translate(-img_y, h[3])), scaling_factor_y)
         h_segments_new.append((x1, y1, x2, y2))

-    num_tables = 0
+    num_tables = 1
+    output = {}
     # sort tables based on y-coord
    for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
        # find rows and columns that lie in table
@@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
         h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
                < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]

-        if debug == ["lines"]:
+        if debug == "line":
             for v in v_s:
                 plt.plot([v[0], v[2]], [v[1], v[3]])
             for h in h_s:
                 plt.plot([h[0], h[2]], [h[1], h[3]])
+            plt.show()
+            return None

         columns, rows = zip(*tables_new[k])
         columns, rows = list(columns), list(rows)
         columns.extend([lb[0], rt[0]])
         rows.extend([lb[1], rt[1]])
         # sort horizontal and vertical segments
-        columns = merge_close_values(sorted(columns), mtol)
-        rows = merge_close_values(sorted(rows, reverse=True), mtol)
+        columns = merge_close_values(sorted(columns), mtol=mtol)
+        rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
         # make grid using x and y coord of shortlisted rows and columns
         columns = [(columns[i], columns[i + 1])
                    for i in range(0, len(columns) - 1)]
@@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,

         table = Table(columns, rows)
         # light up cell edges
-        table = table.set_edges(v_s, h_s, jtol)
+        table = table.set_edges(v_s, h_s, jtol=jtol)
         # table set span method
         table = table.set_spanning()
-        # TODO
+        # light up table border
         table = outline(table)

-        if debug == ["tables"]:
+        if debug == "table":
             for i in range(len(table.cells)):
                 for j in range(len(table.cells[i])):
                     if table.cells[i][j].left:
@@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                     if table.cells[i][j].bottom:
                         plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
                                  [table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
-        if debug:
             plt.show()
+            return None

         # fill text after sorting it
         if not rotated:
@@ -152,8 +293,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                 r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
                 table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))

-        if fill:
-            table = fill(table, fill)
+        if f is not None:
+            table = fill(table, f=f)

         data = []
         for i in range(len(table.cells)):
@@ -165,13 +306,7 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
             data = zip(*data[::1])
             data.reverse()
         data = remove_empty(data)
-        csvname = filename.split(
-            '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
-        csvpath = os.path.join(pdf_dir, csvname)
-        with open(csvpath, 'w') as outfile:
-            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-            for d in data:
-                writer.writerow(d)
-        print "saved as", csvname
-        print
+        output['table_%d' % num_tables] = data
         num_tables += 1

+    return output

morph_transform.py | 75 deleted lines
@@ -1,75 +0,0 @@
-import cv2
-import numpy as np
-
-
-def morph_transform(imagename, s, invert):
-    # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-    img = cv2.imread(imagename)
-    img_x, img_y = img.shape[1], img.shape[0]
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    # empirical result taken from
-    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(np.invert(
-            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    vertical = threshold
-    horizontal = threshold
-
-    scale = s
-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
-
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    _, contours, _ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        # find number of non-zero values in joints using what boundingRect
-        # returns
-        roi = joints[y:y + h, x:x + w]
-        _, jc, _ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than <=4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-
-    v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-
-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-
-    return tables, v_segments, h_segments
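Editorial note, not part of the commit: after this change a caller is expected to rasterize each single-page PDF to a PNG with the same root name before calling lattice(), exactly as camelot.py does with ImageMagick's `convert` above. A hypothetical call (the file names are made up):

```python
import csv
from lattice import lattice

# Assumes 'pg-1.pdf' has already been rendered to 'pg-1.png' next to it,
# as camelot.py does before calling lattice().
tables = lattice('pg-1.pdf', f='h', s=15, jtol=2, mtol=2)
if tables is not None:
    for name in sorted(tables):
        # One CSV per detected table, mirroring the loop in camelot.py.
        with open('pg-1_%s.csv' % name, 'w') as out:
            writer = csv.writer(out)
            for row in tables[name]:
                writer.writerow([cell.encode('utf-8') for cell in row])
```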
pdf.py | 73 changed lines
@@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal


-def parse_text_basic(layout, t=None):
+def parse_text_stream(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTTextHorizontal objects.
+
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
     if t is None:
         t = []
     try:
         for obj in layout._objs:
-            if type(obj) is LTTextLineHorizontal:
+            if isinstance(obj, LTTextLineHorizontal):
                 t.append(obj)
             else:
-                t += parse_text_basic(obj)
+                t += parse_text_stream(obj)
     except AttributeError:
         pass
     return t


-def parse_text_spreadsheet(layout, t=None):
+def parse_text_lattice(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTChar objects.
+
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
     if t is None:
         t = []
     try:
         for obj in layout._objs:
-            if type(obj) is LTChar:
+            if isinstance(obj, LTChar):
                 t.append(obj)
             else:
-                t += parse_text_spreadsheet(obj)
+                t += parse_text_lattice(obj)
     except AttributeError:
         pass
     return t


-def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
+def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
+                 word_margin=0.1):
+    """Get list of text objects along with pdf width and height.
+
+    Parameters
+    ----------
+    pdfname : string
+
+    method : string
+
+    char_margin : float
+
+    line_margin : float
+
+    word_margin : float
+
+    Returns
+    -------
+    text : list
+
+    pdf_x : int
+
+    pdf_y : int
+    """
+    if not method:
+        return None
     with open(pdfname, 'r') as f:
         parser = PDFParser(f)
         document = PDFDocument(parser)
@@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
             layout = device.get_result()
-            if method == 'basic':
-                text = parse_text_basic(layout)
-            elif method == 'spreadsheet':
-                text = parse_text_spreadsheet(layout)
+            if method == 'stream':
+                text = parse_text_stream(layout)
+            elif method == 'lattice':
+                text = parse_text_lattice(layout)
             pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
     return text, pdf_x, pdf_y

stream.py | 143 new lines
@@ -0,0 +1,143 @@
+import os
+import numpy as np
+
+from pdf import get_pdf_info
+
+
+def overlap(l):
+    """Groups overlapping columns and returns list with updated
+    columns boundaries.
+
+    Parameters
+    ----------
+    l : list
+        List of column x-coordinates.
+
+    Returns
+    -------
+    merged : list
+        List of merged column x-coordinates.
+    """
+    merged = []
+    for higher in l:
+        if not merged:
+            merged.append(higher)
+        else:
+            lower = merged[-1]
+            if higher[0] <= lower[1]:
+                upper_bound = max(lower[1], higher[1])
+                lower_bound = min(lower[0], higher[0])
+                merged[-1] = (lower_bound, upper_bound)
+            else:
+                merged.append(higher)
+    return merged
+
+
+def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
+           line_margin=0.5, word_margin=0.1, debug=False):
+    """Stream algorithm
+
+    Groups data returned by PDFMiner into rows and finds mode of the
+    number of elements in each row to guess number of columns.
+
+    Parameters
+    ----------
+    filepath : string
+
+    ncolumns : int, default: 0, optional
+        Number of columns.
+
+    columns : string, default: None, optional
+        Comma-separated list of column x-coordinates.
+
+    char_margin : float, default: 2.0, optional
+        Char margin. Chars closer than cmargin are grouped together
+        to form a word.
+
+    line_margin : float, default: 0.5, optional
+        Line margin. Lines closer than lmargin are grouped together
+        to form a textbox.
+
+    word_margin : float, default: 0.1, optional
+        Word margin. Insert blank spaces between chars if distance
+        between words is greater than word margin.
+
+    debug : bool, default: False, optional
+        Debug by visualizing textboxes.
+
+    Returns
+    -------
+    output : list
+    """
+    filename = os.path.basename(filepath)
+    print "working on", filename
+    text, __, __ = get_pdf_info(filepath, method='stream', char_margin=char_margin,
+                                line_margin=line_margin, word_margin=word_margin)
+    text.sort(key=lambda x: (-x.y0, x.x0))
+    y_last = 0
+    data = []
+    temp = []
+    elements = []
+    for t in text:
+        # is checking for upright necessary?
+        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+        # type(obj) is LTChar]):
+        if t.get_text().strip():
+            if not np.isclose(y_last, t.y0, atol=2):
+                y_last = t.y0
+                elements.append(len(temp))
+                data.append(temp)
+                temp = []
+            temp.append(t)
+
+    if debug:
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect='equal')
+        xs, ys = [], []
+        for d in data:
+            for t in d:
+                xs.extend([t.x0, t.x1])
+                ys.extend([t.y0, t.y1])
+                ax.add_patch(
+                    patches.Rectangle(
+                        (t.x0, t.y0),
+                        t.x1 - t.x0,
+                        t.y1 - t.y0
+                    )
+                )
+        ax.set_xlim(min(xs) - 10, max(xs) + 10)
+        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        plt.show()
+        return None
+
+    if columns:
+        cols = [(float(columns[i]), float(columns[i + 1]))
+                for i in range(0, len(columns) - 1)]
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+    else:
+        # a table can't have just 1 column, can it?
+        elements = filter(lambda x: x != 1, elements)
+        mode = ncolumns if ncolumns else max(set(elements), key=elements.count)
+        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
+        cols = overlap(sorted(cols))
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+
+    output = [['' for c in cols] for d in data]
+    for row, d in enumerate(data):
+        for t in d:
+            cog = (t.x0 + t.x1) / 2.0
+            diff = [(i, abs(cog - c)) for i, c in enumerate(cols)]
+            if diff:
+                idx = min(diff, key=lambda x: x[1])
+            else:
+                print "couldn't find a table on this page"
+                return None
+            if output[row][idx[0]]:
+                output[row][idx[0]] += ' ' + t.get_text().strip()
+            else:
+                output[row][idx[0]] = t.get_text().strip()
+
+    return output
table.py | 73 changed lines
@@ -4,14 +4,55 @@ from cell import Cell


 class Table:
+    """Table
+
+    Parameters
+    ----------
+    columns : list
+        List of column x-coordinates.
+
+    rows : list
+        List of row y-coordinates.
+
+    Attributes
+    ----------
+    cells : list
+        2-D list of cell objects.
+
+    columns : list
+        List of column x-coordinates.
+
+    rows : list
+        List of row y-coordinates.
+    """
     def __init__(self, columns, rows):
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                        for c in columns] for r in rows]
         self.columns = columns
         self.rows = rows

-    def set_edges(self, vertical, horizontal, jtol):
+    def set_edges(self, vertical, horizontal, jtol=2):
+        """Set cell edges to True if corresponding line segments
+        are detected in the pdf image.
+
+        Parameters
+        ----------
+        vertical : list
+            List of vertical line segments.
+
+        horizontal : list
+            List of horizontal line segments.
+
+        jtol : int, default: 2, optional
+            Tolerance to account for when comparing joint and line
+            coordinates.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
         for v in vertical:
             # find closest x coord
             # iterate over y coords and find closest points
@@ -117,6 +158,14 @@ class Table:
         return self

     def set_spanning(self):
+        """Set spanning values of a cell to True if it isn't
+        bounded by four edges.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
         for i in range(len(self.cells)):
             for j in range(len(self.cells[i])):
                 bound = self.cells[i][j].get_bounded_edges()
@@ -125,28 +174,38 @@ class Table:

                 elif bound == 3:
                     if not self.cells[i][j].left:
-                        if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].right and
+                                self.cells[i][j].top and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_h = True

                     elif not self.cells[i][j].right:
-                        if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].top and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_h = True

                     elif not self.cells[i][j].top:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_v = True

                     elif not self.cells[i][j].bottom:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].top):
                             self.cells[i][j].spanning_v = True

                 elif bound == 2:
                     if self.cells[i][j].left and self.cells[i][j].right:
-                        if not self.cells[i][j].top and not self.cells[i][j].bottom:
+                        if (not self.cells[i][j].top and
+                                not self.cells[i][j].bottom):
                             self.cells[i][j].spanning_v = True

                     elif self.cells[i][j].top and self.cells[i][j].bottom:
-                        if not self.cells[i][j].left and not self.cells[i][j].right:
+                        if (not self.cells[i][j].left and
+                                not self.cells[i][j].right):
                             self.cells[i][j].spanning_h = True

         return self
utils.py | 154 changed lines
@@ -2,16 +2,61 @@ import numpy as np


 def translate(x1, x2):
+    """Translate coordinate x2 by x1.
+
+    Parameters
+    ----------
+    x1 : float
+
+    x2 : float
+
+    Returns
+    -------
+    x2 : float
+    """
     x2 += x1
     return x2


 def scale(x, s):
+    """Scale coordinate x by scaling factor s.
+
+    Parameters
+    ----------
+    x : float
+
+    s : float
+
+    Returns
+    -------
+    x : float
+    """
     x *= s
     return x


 def rotate(x1, y1, x2, y2, angle):
+    """Rotate point x2, y2 about point x1, y1 by angle.
+
+    Parameters
+    ----------
+    x1 : float
+
+    y1 : float
+
+    x2 : float
+
+    y2 : float
+
+    angle : float
+        Angle in radians.
+
+    Returns
+    -------
+    xnew : float
+
+    ynew : float
+    """
     s = np.sin(angle)
     c = np.cos(angle)
     x2 = translate(-x1, x2)
@@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
     return xnew, ynew


-def remove_close_values(ar, mtol):
+def remove_close_values(ar, mtol=2):
+    """Remove values which are within a tolerance of mtol of another value
+    present in list.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
     ret = []
     for a in ar:
         if not ret:
@@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
     return ret


-def merge_close_values(ar, mtol):
+def merge_close_values(ar, mtol=2):
+    """Merge values which are within a tolerance of mtol by calculating
+    a moving mean.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
     ret = []
     for a in ar:
         if not ret:
@@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):


 def get_row_idx(t, rows):
+    """Get index of the row in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    rows : list
+
+    Returns
+    -------
+    r : int
+    """
     for r in range(len(rows)):
         if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
             return r


 def get_column_idx(t, columns):
+    """Get index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c : int
+    """
     for c in range(len(columns)):
         if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
             return c


 def reduce_index(t, rotated, r_idx, c_idx):
+    """Shift a text object if it lies within a spanning cell taking
+    in account table rotation.
+
+    Parameters
+    ----------
+    t : object
+
+    rotated : string
+
+    r_idx : int
+
+    c_idx : int
+
+    Returns
+    -------
+    r_idx : int
+
+    c_idx : int
+    """
     if not rotated:
         if t.cells[r_idx][c_idx].spanning_h:
             while not t.cells[r_idx][c_idx].left:
@@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):


 def outline(t):
+    """Light up table boundary.
+
+    Parameters
+    ----------
+    t : object
+
+    Returns
+    -------
+    t : object
+    """
     for i in range(len(t.cells)):
         t.cells[i][0].left = True
         t.cells[i][len(t.cells[i]) - 1].right = True
@@ -99,7 +225,19 @@ def outline(t):
     return t


-def fill(t, f):
+def fill(t, f=None):
+    """Fill spanning cells.
+
+    Parameters
+    ----------
+    t : object
+
+    f : string, default: None, optional
+
+    Returns
+    -------
+    t : object
+    """
     if f == "h":
         for i in range(len(t.cells)):
             for j in range(len(t.cells[i])):
@@ -124,6 +262,16 @@ def fill(t, f):


 def remove_empty(d):
+    """Remove empty rows and columns.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    d : list
+    """
     for i, row in enumerate(d):
         if row == [''] * len(row):
             d.pop(i)
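Editorial note, not part of the commit: merge_close_values is documented above as merging values within mtol via a moving mean, but its body falls outside the shown hunks. A minimal sketch consistent with that docstring:

```python
def merge_close_values(ar, mtol=2):
    # Sketch only (the real body is not shown in the hunks above): fold each
    # value within mtol of the last kept value into a running mean,
    # otherwise start a new entry.
    ret = []
    for a in ar:
        if not ret:
            ret.append(a)
        elif abs(a - ret[-1]) <= mtol:
            ret[-1] = (ret[-1] + a) / 2.0
        else:
            ret.append(a)
    return ret

# merge_close_values([10, 11, 50, 51.5], mtol=2) -> [10.5, 50.75]
```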