diff --git a/.gitignore b/.gitignore
index fefd514..e5bdc6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
__pycache__/
*.py[cod]
-.camelot/
+*.so
+
+.camelot/
\ No newline at end of file
diff --git a/README.md b/README.md
index 1a59793..e6712e7 100644
--- a/README.md
+++ b/README.md
@@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p
## Usage
-python2 camelot.py [options] file
+
+camelot.py [options] <method> [<args>...]
-positional arguments:
+options:
+ -h, --help Show this screen.
+ -v, --version Show version.
+ -p, --pages <pageno> Comma-separated list of page numbers.
+ Example: -p 1,3-6,10 [default: 1]
+ -f, --format <format> Output format. (csv,xlsx) [default: csv]
+ -l, --log Print log to file.
+ -o, --output <directory> Output directory.
- file
+camelot methods:
+ lattice Looks for lines between data.
+ stream Looks for spaces between data.
-optional arguments:
-
- -h, --help
-
- show this help message and exit
-
- -p, --pages PAGES [PAGES ...]
-
- Specify the page numbers and/or page ranges to be
- parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-
- -f, --format FORMAT
-
- Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-
- -m, --spreadsheet
-
- Extract tables with ruling lines. (default: False)
-
- -F, --fill FILL
-
- Fill the values in empty cells horizontally(h) and/or
- vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-
- -s, --scale [SCALE]
-
- Scaling factor. Large scaling factor leads to smaller
- lines being detected. (default: 15)
-
- -j, --jtol [JTOL]
-
- Tolerance to account for when comparing joint and line
- coordinates. (default: 2)
-
- -M, --mtol [MTOL]
-
- Tolerance to account for when merging lines which are
- very close. (default: 2)
-
- -i, --invert
-
- Make sure lines are in foreground. (default: False)
-
- -d, --debug DEBUG
-
- Debug by visualizing contours, lines, joints, tables.
- Example: --debug="contours"
-
- -o, --output OUTPUT
-
- Specify output directory.
+See 'camelot <method> -h' for more information on a specific method.
+
## Development
diff --git a/basic.py b/basic.py
deleted file mode 100644
index e2ff777..0000000
--- a/basic.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import os
-import csv
-import numpy as np
-
-from pdf import get_pdf_info
-
-
-def overlap(l):
- merged = []
- for higher in l:
- if not merged:
- merged.append(higher)
- else:
- lower = merged[-1]
- if higher[0] <= lower[1]:
- upper_bound = max(lower[1], higher[1])
- lower_bound = min(lower[0], higher[0])
- merged[-1] = (lower_bound, upper_bound)
- else:
- merged.append(higher)
- return merged
-
-
-def get_row_idx(t, rows):
- for r in range(len(rows)):
- if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
- return r
-
-
-def get_column_idx(t, columns):
- for c in range(len(columns)):
- if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
- return c
-
-
-def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
- print "working on", filename
- text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
- char_margin, line_margin, word_margin)
- text.sort(key=lambda x: (-x.y0, x.x0))
- y_last = 0
- data = []
- temp = []
- elements = []
- for t in text:
- # is checking for upright necessary?
- # if t.get_text().strip() and all([obj.upright for obj in t._objs if
- # type(obj) is LTChar]):
- if t.get_text().strip():
- if not np.isclose(y_last, t.y0, atol=2):
- y_last = t.y0
- elements.append(len(temp))
- data.append(temp)
- temp = []
- temp.append(t)
- # a table can't have just 1 column, can it?
- elements = filter(lambda x: x != 1, elements)
- # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
- mode = max(set(elements), key=elements.count)
- columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
- columns = overlap(sorted(columns))
- columns = [(c[0] + c[1]) / 2.0 for c in columns]
-
- output = [['' for c in columns] for d in data]
- for row, d in enumerate(data):
- for t in d:
- cog = (t.x0 + t.x1) / 2.0
- diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
- idx = min(diff, key=lambda x: x[1])
- if output[row][idx[0]]:
- output[row][idx[0]] += ' ' + t.get_text().strip()
- else:
- output[row][idx[0]] = t.get_text().strip()
-
- csvname = filename.split('.')[0] + '.csv'
- csvpath = os.path.join(pdf_dir, csvname)
- with open(csvpath, 'w') as outfile:
- writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
- for row in output:
- writer.writerow([cell.encode('utf-8') for cell in row])
diff --git a/camelot.py b/camelot.py
old mode 100644
new mode 100755
index 7efd914..44e9dee
--- a/camelot.py
+++ b/camelot.py
@@ -1,136 +1,258 @@
+#!/usr/bin/env python2
import os
import re
+import csv
+import sys
import glob
import time
import shutil
import logging
+import zipfile
+import tempfile
import subprocess
-import argparse
+from docopt import docopt
+from werkzeug.utils import secure_filename
-from basic import basic
-from spreadsheet import spreadsheet
+from lattice import lattice
+from stream import stream
+
+
+doc = """
+camelot parses tables from PDFs!
+
+usage:
+ camelot.py [options] [...]
+
+options:
+ -h, --help Show this screen.
+ -v, --version Show version.
+ -p, --pages Comma-separated list of page numbers.
+ Example: -p 1,3-6,10 [default: 1]
+ -f, --format Output format. (csv,xlsx) [default: csv]
+ -l, --log Print log to file.
+ -o, --output Output directory.
+
+camelot methods:
+ lattice Looks for lines between data.
+ stream Looks for spaces between data.
+
+See 'camelot -h' for more information on a specific method.
+"""
+
+lattice_doc = """
+Lattice method looks for lines between data to form a table.
+
+usage:
+ camelot.py lattice [options] [--]
+
+options:
+ -F, --fill Fill data in horizontal and/or vertical spanning
+ cells. Example: -F h, -F v, -F hv
+ -s, --scale Scaling factor. Large scaling factor leads to
+ smaller lines being detected. [default: 15]
+ -j, --jtol Tolerance to account for when comparing joint
+ and line coordinates. [default: 2]
+ -m, --mtol Tolerance to account for when merging lines
+ which are very close. [default: 2]
+ -i, --invert Invert pdf image to make sure that lines are
+ in foreground.
+ -d, --debug Debug by visualizing pdf geometry.
+ (contour,line,joint,table) Example: -d table
+"""
+
+stream_doc = """
+Stream method looks for spaces between data to form a table.
+
+usage:
+ camelot.py stream [options] [--]
+
+options:
+ -n, --ncols Number of columns. [default: 0]
+ -c, --columns Comma-separated list of column x-coordinates.
+ Example: -c 10.1,20.2,30.3
+ -M, --cmargin Char margin. Chars closer than cmargin are
+ grouped together to form a word. [default: 2.0]
+ -L, --lmargin Line margin. Lines closer than lmargin are
+ grouped together to form a textbox. [default: 0.5]
+ -W, --wmargin Word margin. Insert blank spaces between chars
+ if distance between words is greater than word
+ margin. [default: 0.1]
+ -d, --debug Debug by visualizing textboxes.
+"""
pno = re.compile(r'\d+')
-def mkdir(directory):
- if not os.path.isdir(directory):
- os.makedirs(directory)
-
-
-def filesort(filename):
- filename = filename.split('/')[-1]
+def filesort(filepath):
+ filename = os.path.basename(filepath)
num = pno.findall(filename)
if len(num) == 2:
return (int(num[0]), int(num[1]))
else:
return (int(num[0]), 0)
-start_time = time.time()
-CAMELOT_DIR = '.camelot/'
-mkdir(CAMELOT_DIR)
-parser = argparse.ArgumentParser(
- description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
-parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
- help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
-parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
- help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
- help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-i', '--fill', action='store', dest='fill',
- help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
- help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
-parser.add_argument('-j', '--jtol', nargs='?', action='store',
- dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-t', '--mtol', nargs='?', action='store',
- dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-n', '--invert', action='store_true', dest='invert',
- help='Make sure lines are in foreground. (default: False)')
-parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
- help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
-parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
- help='(default: 2.0)', default=2.0, type=float)
-parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
- help='(default: 0.5)', default=0.5, type=float)
-parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
- help='(default: 0.1)', default=0.1, type=float)
-parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
- help='Specify output directory.')
-parser.add_argument('file', nargs=1)
+if __name__ == '__main__':
+ start_time = time.time()
+ tmpdir = tempfile.mkdtemp()
-result = parser.parse_args()
+ args = docopt(doc, version='0.1', options_first=True)
+ argv = [args['']] + args['']
+ if args[''] == 'lattice':
+ args.update(docopt(lattice_doc, argv=argv))
+ elif args[''] == 'stream':
+ args.update(docopt(stream_doc, argv=argv))
-if result.pages:
- if result.pages == ['all']:
- p = result.pages
+ if args['--pages']:
+ if args['--pages'] == ['all']:
+ p = args['--pages']
+ else:
+ p = []
+ for r in args['--pages'].split(','):
+ if '-' in r:
+ a, b = r.split('-')
+ a, b = int(a), int(b)
+ p.extend([str(i) for i in range(a, b + 1)])
+ else:
+ p.extend([str(r)])
else:
- p = []
- for r in result.pages[0].split(' '):
- if '-' in r:
- a, b = r.split('-')
- a, b = int(a), int(b)
- p.extend([str(i) for i in range(a, b + 1)])
- else:
- p.extend([str(r)])
-else:
- p = ['1']
-p = sorted(set(p))
+ p = ['1']
+ p = sorted(set(p))
-filename = result.file[0].split('/')[-1]
-# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
-pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
-mkdir(pdf_dir)
-logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
- 0] + '.log'), filemode='w', level=logging.DEBUG)
+ fname = os.path.basename(args[''])
+ fname = secure_filename(fname)
+ fdir = os.path.dirname(args[''])
+ froot, fext = os.path.splitext(fname)
+ if fext.lower() != '.pdf':
+ print "camelot can parse only pdfs right now"
+ sys.exit()
-shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
-print "separating pdf into pages"
-print
-if p == ['all']:
- subprocess.call(['pdfseparate', os.path.join(
- pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
-else:
- for page in p:
- subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
- pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
+ logfname = os.path.join(tmpdir, froot + '.log')
+ logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)
-if result.spreadsheet:
- print "using the spreadsheet method"
- for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
- print "converting", g.split('/')[-1], "to image"
- os.system(' '.join(['convert', '-density', '300',
- g, '-depth', '8', g[:-4] + '.png']))
- try:
- spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
- result.jtol, result.mtol, result.invert, result.debug,
- result.char_margin, result.line_margin, result.word_margin)
- except:
- logging.error("Couldn't parse " + g.split('/')[-1])
- print "Couldn't parse", g.split('/')[-1]
-else:
- print "using the basic method"
- for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
- basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
-
-if result.format == ['xlsx']:
- import csv
- from pyexcel_xlsx import save_data
- from collections import OrderedDict
- data = OrderedDict()
- for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
- print "adding", c.split('/')[-1], "to excel file"
- with open(c, 'r') as csvfile:
- reader = csv.reader(csvfile)
- data.update({c.split('/')[-1].split('.')
- [0]: [row for row in reader]})
- xlsxname = filename.split('.')[0] + '.xlsx'
- xlsxpath = os.path.join(pdf_dir, xlsxname)
- save_data(xlsxpath, data)
+ shutil.copy(args[''], os.path.join(tmpdir, fname))
+ print "separating pdf into pages"
print
- print "saved as", xlsxname
+ if p == ['all']:
+ subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
+ 'pg-%d.pdf')])
+ else:
+ for page in p:
+ subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
+ os.path.join(tmpdir, 'pg-%s.pdf' % page)])
-print "finished in", time.time() - start_time, "seconds"
-logging.info("Time taken for " + filename + ": " +
- str(time.time() - start_time) + " seconds")
+ glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
+ if args[''] == 'lattice':
+ print "using the lattice method"
+ for g in glob_pdf:
+ g_fname = os.path.basename(g)
+ g_froot, __ = os.path.splitext(g)
+ print "converting %s to image" % g_fname
+ os.system(' '.join(['convert', '-density', '300',
+ g, '-depth', '8', g_froot + '.png']))
+ try:
+ data = lattice(g, f=args['--fill'], s=int(args['--scale']),
+ jtol=int(args['--jtol']), mtol=int(args['--mtol']),
+ invert=args['--invert'], debug=args['--debug'])
+ if data is None:
+ print
+ print "See 'camelot lattice -h' for various parameters you can tweak."
+ sys.exit()
+ for k in sorted(data.keys()):
+ csvfile = g_froot + '_%s.csv' % k
+ with open(csvfile, 'w') as outfile:
+ writer = csv.writer(outfile)
+ for d in data[k]:
+ writer.writerow([c.encode('utf-8') for c in d])
+ print "saved as", os.path.basename(csvfile)
+ print
+ except Exception:
+ logging.exception("")
+ print "couldn't parse", g_fname, "see log for more info"
+ print
+ elif args[''] == 'stream':
+ print "using the stream method"
+ for g in glob_pdf:
+ g_fname = os.path.basename(g)
+ g_froot, __ = os.path.splitext(g)
+ try:
+ data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
+ char_margin=float(args['--cmargin']),
+ line_margin=float(args['--lmargin']),
+ word_margin=float(args['--wmargin']),
+ debug=args['--debug'])
+ if data is None:
+ print
+ print "See 'camelot stream -h' for various parameters you can tweak."
+ sys.exit()
+ csvfile = g_froot + '.csv'
+ with open(csvfile, 'w') as outfile:
+ writer = csv.writer(outfile)
+ for d in data:
+ writer.writerow([c.encode('utf-8') for c in d])
+ print "saved as", os.path.basename(csvfile)
+ print
+ except Exception:
+ logging.exception("")
+ print "couldn't parse", g_fname, "see log for more info"
+ print
+
+ glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
+ if args['--format'] == 'csv':
+ if len(glob_csv) == 1:
+ if args['--output']:
+ shutil.copy(glob_csv[0], args['--output'])
+ if args['--log']:
+ shutil.copy(logfname, args['--output'])
+ else:
+ shutil.copy(glob_csv[0], fdir)
+ if args['--log']:
+ shutil.copy(zippath, fdir)
+ else:
+ zipname = froot + '.zip'
+ zippath = os.path.join(tmpdir, zipname)
+ print "zipping 'em up"
+ with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
+ for g in glob_csv:
+ myzip.write(g, os.path.join(froot, os.path.basename(g)))
+ if args['--output']:
+ shutil.copy(zippath, args['--output'])
+ if args['--log']:
+ shutil.copy(logfname, args['--output'])
+ else:
+ shutil.copy(zippath, fdir)
+ if args['--log']:
+ shutil.copy(zippath, fdir)
+ print
+ elif args['--format'] == 'xlsx':
+ from pyexcel_xlsx import save_data
+ from collections import OrderedDict
+ data = OrderedDict()
+ for c in glob_csv:
+ c_fname = os.path.basename(c)
+ c_froot, __ = os.path.splitext(c)
+ print "adding", c_fname, "to excel file"
+ with open(c, 'r') as csvfile:
+ reader = csv.reader(csvfile)
+ c_froot, __ = os.path.splitext(c_fname)
+ data.update({c_froot: [row for row in reader]})
+ xlsxname = froot + '.xlsx'
+ xlsxpath = os.path.join(tmpdir, xlsxname)
+ save_data(xlsxpath, data)
+ if args['--output']:
+ shutil.copy(xlsxpath, args['--output'])
+ if args['--log']:
+ shutil.copy(logfname, args['--output'])
+ else:
+ shutil.copy(xlsxpath, fdir)
+ if args['--log']:
+ shutil.copy(zippath, fdir)
+ print
+ print "saved as", xlsxname
+
+ print "cleaning up..."
+ shutil.rmtree(tmpdir)
+
+ print "finished in", time.time() - start_time, "seconds"
+ logging.info("Time taken for " + fname + ": " +
+ str(time.time() - start_time) + " seconds")
diff --git a/cell.py b/cell.py
index e2e91cb..ee993ae 100644
--- a/cell.py
+++ b/cell.py
@@ -1,6 +1,44 @@
class Cell:
+ """Cell
+ Parameters
+ ----------
+ x1 : int
+
+ y1 : int
+
+ x2 : int
+
+ y2 : int
+
+ Attributes
+ ----------
+ lb : tuple
+
+ lt : tuple
+
+ rb : tuple
+
+ rt : tuple
+
+ bbox : tuple
+
+ left : bool
+
+ right : bool
+
+ top : bool
+
+ bottom : bool
+
+ text : string
+
+ spanning_h : bool
+
+ spanning_v : bool
+ """
def __init__(self, x1, y1, x2, y2):
+
self.lb = (x1, y1)
self.lt = (x1, y2)
self.rb = (x2, y1)
@@ -15,10 +53,28 @@ class Cell:
self.spanning_v = False
def add_text(self, text):
- self.text += text
+ """Add text to cell object.
+
+ Parameters
+ ----------
+ text : string
+ """
+ self.text = ''.join([self.text, text])
def get_text(self):
+ """Get text from cell object.
+
+ Returns
+ -------
+ text : string
+ """
return self.text
def get_bounded_edges(self):
+ """Get number of edges by which a cell is bounded.
+
+ Returns
+ -------
+ bounded_edges : int
+ """
return self.top + self.bottom + self.left + self.right
diff --git a/spreadsheet.py b/lattice.py
similarity index 51%
rename from spreadsheet.py
rename to lattice.py
index 46ea466..2395dd7 100644
--- a/spreadsheet.py
+++ b/lattice.py
@@ -1,37 +1,173 @@
import os
-import csv
import cv2
import glob
import numpy as np
from table import Table
from pdf import get_pdf_info
-from morph_transform import morph_transform
from utils import (translate, scale, merge_close_values, get_row_idx,
get_column_idx, reduce_index, outline, fill, remove_empty)
-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
- char_margin, line_margin, word_margin):
+def morph_transform(img, s=15, invert=False):
+ """Morphological Transformation
+
+ Applies a series of morphological operations on the image
+ to find table contours and line segments.
+ http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
+
+ Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
+ taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
+
+ Parameters
+ ----------
+ img : ndarray
+
+ s : int, default: 15, optional
+ Scaling factor. Large scaling factor leads to smaller lines
+ being detected.
+
+ invert : bool, default: False, optional
+ Invert pdf image to make sure that lines are in foreground.
+
+ Returns
+ -------
+ tables : dict
+ Dictionary with table bounding box as key and list of
+ joints found in the table as value.
+
+ v_segments : list
+ List of vertical line segments found in the image.
+
+ h_segments : list
+ List of horizontal line segments found in the image.
+ """
+ img_x, img_y = img.shape[1], img.shape[0]
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+ if invert:
+ threshold = cv2.adaptiveThreshold(
+ gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+ else:
+ threshold = cv2.adaptiveThreshold(np.invert(
+ gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+ vertical = threshold
+ horizontal = threshold
+
+ scale = s
+ verticalsize = vertical.shape[0] / scale
+ horizontalsize = horizontal.shape[1] / scale
+
+ ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
+ hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
+
+ vertical = cv2.erode(vertical, ver, (-1, -1))
+ vertical = cv2.dilate(vertical, ver, (-1, -1))
+
+ horizontal = cv2.erode(horizontal, hor, (-1, -1))
+ horizontal = cv2.dilate(horizontal, hor, (-1, -1))
+
+ mask = vertical + horizontal
+ joints = np.bitwise_and(vertical, horizontal)
+ __, contours, __ = cv2.findContours(
+ mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+ tables = {}
+ for c in contours:
+ c_poly = cv2.approxPolyDP(c, 3, True)
+ x, y, w, h = cv2.boundingRect(c_poly)
+ # find number of non-zero values in joints using what boundingRect
+ # returns
+ roi = joints[y : y + h, x : x + w]
+ __, jc, __ = cv2.findContours(
+ roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+ if len(jc) <= 4: # remove contours with less than <=4 joints
+ continue
+ joint_coords = []
+ for j in jc:
+ jx, jy, jw, jh = cv2.boundingRect(j)
+ c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+ joint_coords.append((c1, c2))
+ tables[(x, y + h, x + w, y)] = joint_coords
+
+ v_segments, h_segments = [], []
+ _, vcontours, _ = cv2.findContours(
+ vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ for vc in vcontours:
+ x, y, w, h = cv2.boundingRect(vc)
+ x1, x2 = x, x + w
+ y1, y2 = y, y + h
+ v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
+
+ _, hcontours, _ = cv2.findContours(
+ horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ for hc in hcontours:
+ x, y, w, h = cv2.boundingRect(hc)
+ x1, x2 = x, x + w
+ y1, y2 = y, y + h
+ h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+ return tables, v_segments, h_segments
+
+
+def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
+ """Lattice algorithm
+
+ Makes table using pdf geometry information returned by
+ morph_transform and fills data returned by PDFMiner in table cells.
+
+ Parameters
+ ----------
+ filepath : string
+
+ f : string, default: None, optional
+ Fill data in horizontal and/or vertical spanning
+ cells. ('h', 'v', 'hv')
+
+ s : int, default: 15, optional
+ Scaling factor. Large scaling factor leads to smaller lines
+ being detected.
+
+ jtol : int, default: 2, optional
+ Tolerance to account for when comparing joint and line
+ coordinates.
+
+ mtol : int, default: 2, optional
+ Tolerance to account for when merging lines which are
+ very close.
+
+ invert : bool, default: False, optional
+ Invert pdf image to make sure that lines are in foreground.
+
+ debug : string
+ Debug by visualizing pdf geometry.
+ ('contour', 'line', 'joint', 'table')
+ Returns
+ -------
+ output : dict
+ Dictionary with table number as key and list of data as value.
+ """
if debug:
import matplotlib.pyplot as plt
- import matplotlib.patches as patches
+ filename = os.path.basename(filepath)
print "working on", filename
- imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
+ fileroot, __ = os.path.splitext(filepath)
+ imagename = fileroot + '.png'
img = cv2.imread(imagename)
img_x, img_y = img.shape[1], img.shape[0]
- text, pdf_x, pdf_y = get_pdf_info(
- os.path.join(pdf_dir, filename), 'spreadsheet',
- char_margin, line_margin, word_margin)
+ text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
- tables, v_segments, h_segments = morph_transform(imagename, s, invert)
+ tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)
- if debug == ["contours"]:
+ if debug == "contour":
for t in tables.keys():
cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
- if debug == ["joints"]:
+ plt.show()
+ return None
+ if debug == "joint":
x_coord = []
y_coord = []
for k in tables.keys():
@@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
+ plt.show()
+ return None
# detect if vertical
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
@@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
abs(translate(-img_y, h[3])), scaling_factor_y)
h_segments_new.append((x1, y1, x2, y2))
- num_tables = 0
+ num_tables = 1
+ output = {}
# sort tables based on y-coord
for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
# find rows and columns that lie in table
@@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
< rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
- if debug == ["lines"]:
+ if debug == "line":
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
+ plt.show()
+ return None
columns, rows = zip(*tables_new[k])
columns, rows = list(columns), list(rows)
columns.extend([lb[0], rt[0]])
rows.extend([lb[1], rt[1]])
# sort horizontal and vertical segments
- columns = merge_close_values(sorted(columns), mtol)
- rows = merge_close_values(sorted(rows, reverse=True), mtol)
+ columns = merge_close_values(sorted(columns), mtol=mtol)
+ rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
# make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1])
for i in range(0, len(columns) - 1)]
@@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
table = Table(columns, rows)
# light up cell edges
- table = table.set_edges(v_s, h_s, jtol)
+ table = table.set_edges(v_s, h_s, jtol=jtol)
# table set span method
table = table.set_spanning()
- # TODO
+ # light up table border
table = outline(table)
- if debug == ["tables"]:
+ if debug == "table":
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
@@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
- if debug:
plt.show()
+ return None
# fill text after sorting it
if not rotated:
@@ -152,26 +293,20 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
- if fill:
- table = fill(table, fill)
+ if f is not None:
+ table = fill(table, f=f)
data = []
for i in range(len(table.cells)):
data.append([table.cells[i][j].get_text().strip().encode('utf-8')
- for j in range(len(table.cells[i]))])
+ for j in range(len(table.cells[i]))])
if rotated == 'left':
data = zip(*data[::-1])
elif rotated == 'right':
data = zip(*data[::1])
data.reverse()
data = remove_empty(data)
- csvname = filename.split(
- '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
- csvpath = os.path.join(pdf_dir, csvname)
- with open(csvpath, 'w') as outfile:
- writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
- for d in data:
- writer.writerow(d)
- print "saved as", csvname
- print
+ output['table_%d' % num_tables] = data
num_tables += 1
+
+ return output
\ No newline at end of file
diff --git a/morph_transform.py b/morph_transform.py
deleted file mode 100644
index 09f0c16..0000000
--- a/morph_transform.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import cv2
-import numpy as np
-
-
-def morph_transform(imagename, s, invert):
- # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
- img = cv2.imread(imagename)
- img_x, img_y = img.shape[1], img.shape[0]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- # empirical result taken from
- # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
- if invert:
- threshold = cv2.adaptiveThreshold(
- gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
- else:
- threshold = cv2.adaptiveThreshold(np.invert(
- gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
- vertical = threshold
- horizontal = threshold
-
- scale = s
- verticalsize = vertical.shape[0] / scale
- horizontalsize = horizontal.shape[1] / scale
-
- ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
- hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-
- vertical = cv2.erode(vertical, ver, (-1, -1))
- vertical = cv2.dilate(vertical, ver, (-1, -1))
-
- horizontal = cv2.erode(horizontal, hor, (-1, -1))
- horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-
- mask = vertical + horizontal
- joints = np.bitwise_and(vertical, horizontal)
- _, contours, _ = cv2.findContours(
- mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
- contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-
- tables = {}
- for c in contours:
- c_poly = cv2.approxPolyDP(c, 3, True)
- x, y, w, h = cv2.boundingRect(c_poly)
- # find number of non-zero values in joints using what boundingRect
- # returns
- roi = joints[y:y + h, x:x + w]
- _, jc, _ = cv2.findContours(
- roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
- if len(jc) <= 4: # remove contours with less than <=4 joints
- continue
- joint_coords = []
- for j in jc:
- jx, jy, jw, jh = cv2.boundingRect(j)
- c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
- joint_coords.append((c1, c2))
- tables[(x, y + h, x + w, y)] = joint_coords
-
- v_segments, h_segments = [], []
- _, vcontours, _ = cv2.findContours(
- vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
- for vc in vcontours:
- x, y, w, h = cv2.boundingRect(vc)
- x1, x2 = x, x + w
- y1, y2 = y, y + h
- v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-
- _, hcontours, _ = cv2.findContours(
- horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
- for hc in hcontours:
- x, y, w, h = cv2.boundingRect(hc)
- x1, x2 = x, x + w
- y1, y2 = y, y + h
- h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-
- return tables, v_segments, h_segments
diff --git a/pdf.py b/pdf.py
index 136904c..b39f185 100644
--- a/pdf.py
+++ b/pdf.py
@@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
-def parse_text_basic(layout, t=None):
+def parse_text_stream(layout, t=None):
+ """Recursively parse pdf layout to get a list of
+ LTTextLineHorizontal objects.
+
+ Parameters
+ ----------
+ layout : object
+
+ t : list
+
+ Returns
+ -------
+ t : list
+ """
if t is None:
t = []
try:
for obj in layout._objs:
- if type(obj) is LTTextLineHorizontal:
+ if isinstance(obj, LTTextLineHorizontal):
t.append(obj)
else:
- t += parse_text_basic(obj)
+ t += parse_text_stream(obj)
except AttributeError:
pass
return t
-def parse_text_spreadsheet(layout, t=None):
+def parse_text_lattice(layout, t=None):
+ """Recursively parse pdf layout to get a list of
+ LTChar objects.
+
+ Parameters
+ ----------
+ layout : object
+
+ t : list
+
+ Returns
+ -------
+ t : list
+ """
if t is None:
t = []
try:
for obj in layout._objs:
- if type(obj) is LTChar:
+ if isinstance(obj, LTChar):
t.append(obj)
else:
- t += parse_text_spreadsheet(obj)
+ t += parse_text_lattice(obj)
except AttributeError:
pass
return t
-def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
+def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
+ word_margin=0.1):
+ """Get list of text objects along with pdf width and height.
+
+ Parameters
+ ----------
+ pdfname : string
+
+ method : string
+
+ char_margin : float
+
+ line_margin : float
+
+ word_margin : float
+
+ Returns
+ -------
+ text : list
+
+ pdf_x : int
+
+ pdf_y : int
+ """
+ if not method:
+ return None
with open(pdfname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
@@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
- if method == 'basic':
- text = parse_text_basic(layout)
- elif method == 'spreadsheet':
- text = parse_text_spreadsheet(layout)
+ if method == 'stream':
+ text = parse_text_stream(layout)
+ elif method == 'lattice':
+ text = parse_text_lattice(layout)
pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
return text, pdf_x, pdf_y
diff --git a/stream.py b/stream.py
new file mode 100644
index 0000000..13cb8d8
--- /dev/null
+++ b/stream.py
@@ -0,0 +1,143 @@
+import os
+import numpy as np
+
+from pdf import get_pdf_info
+
+
+def overlap(l):
+ """Groups overlapping columns and returns list with updated
+ columns boundaries.
+
+ Parameters
+ ----------
+ l : list
+ List of column x-coordinates.
+
+ Returns
+ -------
+ merged : list
+ List of merged column x-coordinates.
+ """
+ merged = []
+ for higher in l:
+ if not merged:
+ merged.append(higher)
+ else:
+ lower = merged[-1]
+ if higher[0] <= lower[1]:
+ upper_bound = max(lower[1], higher[1])
+ lower_bound = min(lower[0], higher[0])
+ merged[-1] = (lower_bound, upper_bound)
+ else:
+ merged.append(higher)
+ return merged
+
+
+def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
+ line_margin=0.5, word_margin=0.1, debug=False):
+ """Stream algorithm
+
+ Groups data returned by PDFMiner into rows and finds mode of the
+ number of elements in each row to guess number of columns.
+
+ Parameters
+ ----------
+ filepath : string
+
+ ncolumns : int, default: 0, optional
+ Number of columns.
+
+ columns : list, default: None, optional
+ List of column x-coordinates; each item is converted with float().
+
+ char_margin : float, default: 2.0, optional
+ Char margin. Chars closer than char_margin are grouped together
+ to form a word.
+
+ line_margin : float, default: 0.5, optional
+ Line margin. Lines closer than line_margin are grouped together
+ to form a textbox.
+
+ word_margin : float, default: 0.1, optional
+ Word margin. Insert blank spaces between chars if distance
+ between words is greater than word margin.
+
+ debug : bool, default: False, optional
+ Debug by visualizing textboxes.
+
+ Returns
+ -------
+ output : list
+ """
+ filename = os.path.basename(filepath)
+ print "working on", filename
+ text, __, __ = get_pdf_info(filepath, method='stream', char_margin=char_margin,
+ line_margin=line_margin, word_margin=word_margin)
+ text.sort(key=lambda x: (-x.y0, x.x0))
+ y_last = 0
+ data = []
+ temp = []
+ elements = []
+ for t in text:
+ # is checking for upright necessary?
+ # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+ # type(obj) is LTChar]):
+ if t.get_text().strip():
+ if not np.isclose(y_last, t.y0, atol=2):
+ y_last = t.y0
+ elements.append(len(temp))
+ data.append(temp)
+ temp = []
+ temp.append(t)
+
+ if debug:
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+
+ fig = plt.figure()
+ ax = fig.add_subplot(111, aspect='equal')
+ xs, ys = [], []
+ for d in data:
+ for t in d:
+ xs.extend([t.x0, t.x1])
+ ys.extend([t.y0, t.y1])
+ ax.add_patch(
+ patches.Rectangle(
+ (t.x0, t.y0),
+ t.x1 - t.x0,
+ t.y1 - t.y0
+ )
+ )
+ ax.set_xlim(min(xs) - 10, max(xs) + 10)
+ ax.set_ylim(min(ys) - 10, max(ys) + 10)
+ plt.show()
+ return None
+
+ if columns:
+ cols = [(float(columns[i]), float(columns[i + 1]))
+ for i in range(0, len(columns) - 1)]
+ cols = [(c[0] + c[1]) / 2.0 for c in cols]
+ else:
+ # a table can't have just 1 column, can it?
+ elements = filter(lambda x: x != 1, elements)
+ mode = ncolumns if ncolumns else max(set(elements), key=elements.count)
+ cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
+ cols = overlap(sorted(cols))
+ cols = [(c[0] + c[1]) / 2.0 for c in cols]
+
+ output = [['' for c in cols] for d in data]
+ for row, d in enumerate(data):
+ for t in d:
+ cog = (t.x0 + t.x1) / 2.0
+ diff = [(i, abs(cog - c)) for i, c in enumerate(cols)]
+ if diff:
+ idx = min(diff, key=lambda x: x[1])
+ else:
+ print "couldn't find a table on this page"
+ return None
+ if output[row][idx[0]]:
+ output[row][idx[0]] += ' ' + t.get_text().strip()
+ else:
+ output[row][idx[0]] = t.get_text().strip()
+
+ return output
\ No newline at end of file
diff --git a/table.py b/table.py
index 3e4e338..d38279c 100644
--- a/table.py
+++ b/table.py
@@ -4,14 +4,55 @@ from cell import Cell
class Table:
+ """Table
+
+ Parameters
+ ----------
+ columns : list
+ List of column x-coordinates.
+ rows : list
+ List of row y-coordinates.
+
+ Attributes
+ ----------
+ cells : list
+ 2-D list of cell objects.
+
+ columns : list
+ List of column x-coordinates.
+
+ rows : list
+ List of row y-coordinates.
+ """
def __init__(self, columns, rows):
+
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in columns] for r in rows]
self.columns = columns
self.rows = rows
- def set_edges(self, vertical, horizontal, jtol):
+ def set_edges(self, vertical, horizontal, jtol=2):
+ """Set cell edges to True if corresponding line segments
+ are detected in the pdf image.
+
+ Parameters
+ ----------
+ vertical : list
+ List of vertical line segments.
+
+ horizontal : list
+ List of horizontal line segments.
+
+ jtol : int, default: 2, optional
+ Tolerance to account for when comparing joint and line
+ coordinates.
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
for v in vertical:
# find closest x coord
# iterate over y coords and find closest points
@@ -117,6 +158,14 @@ class Table:
return self
def set_spanning(self):
+ """Set spanning values of a cell to True if it isn't
+ bounded by four edges.
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
for i in range(len(self.cells)):
for j in range(len(self.cells[i])):
bound = self.cells[i][j].get_bounded_edges()
@@ -125,28 +174,38 @@ class Table:
elif bound == 3:
if not self.cells[i][j].left:
- if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
+ if (self.cells[i][j].right and
+ self.cells[i][j].top and
+ self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
elif not self.cells[i][j].right:
- if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
+ if (self.cells[i][j].left and
+ self.cells[i][j].top and
+ self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
elif not self.cells[i][j].top:
- if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
+ if (self.cells[i][j].left and
+ self.cells[i][j].right and
+ self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
elif not self.cells[i][j].bottom:
- if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
+ if (self.cells[i][j].left and
+ self.cells[i][j].right and
+ self.cells[i][j].top):
self.cells[i][j].spanning_v = True
elif bound == 2:
if self.cells[i][j].left and self.cells[i][j].right:
- if not self.cells[i][j].top and not self.cells[i][j].bottom:
+ if (not self.cells[i][j].top and
+ not self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
elif self.cells[i][j].top and self.cells[i][j].bottom:
- if not self.cells[i][j].left and not self.cells[i][j].right:
+ if (not self.cells[i][j].left and
+ not self.cells[i][j].right):
self.cells[i][j].spanning_h = True
return self
diff --git a/utils.py b/utils.py
index 46b62cc..89d037c 100644
--- a/utils.py
+++ b/utils.py
@@ -2,16 +2,61 @@ import numpy as np
def translate(x1, x2):
+ """Translate coordinate x2 by x1.
+
+ Parameters
+ ----------
+ x1 : float
+
+ x2 : float
+
+ Returns
+ -------
+ x2 : float
+ """
x2 += x1
return x2
def scale(x, s):
+ """Scale coordinate x by scaling factor s.
+
+ Parameters
+ ----------
+ x : float
+
+ s : float
+
+ Returns
+ -------
+ x : float
+ """
x *= s
return x
def rotate(x1, y1, x2, y2, angle):
+ """Rotate point x2, y2 about point x1, y1 by angle.
+
+ Parameters
+ ----------
+ x1 : float
+
+ y1 : float
+
+ x2 : float
+
+ y2 : float
+
+ angle : float
+ Angle in radians.
+
+ Returns
+ -------
+ xnew : float
+
+ ynew : float
+ """
s = np.sin(angle)
c = np.cos(angle)
x2 = translate(-x1, x2)
@@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
return xnew, ynew
-def remove_close_values(ar, mtol):
+def remove_close_values(ar, mtol=2):
+ """Remove values which are within a tolerance of mtol of another value
+ present in list.
+
+ Parameters
+ ----------
+ ar : list
+
+ mtol : int, default: 2, optional
+
+ Returns
+ -------
+ ret : list
+ """
ret = []
for a in ar:
if not ret:
@@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
return ret
-def merge_close_values(ar, mtol):
+def merge_close_values(ar, mtol=2):
+ """Merge values which are within a tolerance of mtol by calculating
+ a moving mean.
+
+ Parameters
+ ----------
+ ar : list
+
+ mtol : int, default: 2, optional
+
+ Returns
+ -------
+ ret : list
+ """
ret = []
for a in ar:
if not ret:
@@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):
def get_row_idx(t, rows):
+ """Get index of the row in which the given object falls by
+ comparing their co-ordinates.
+
+ Parameters
+ ----------
+ t : object
+
+ rows : list
+
+ Returns
+ -------
+ r : int
+ """
for r in range(len(rows)):
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
return r
def get_column_idx(t, columns):
+ """Get index of the column in which the given object falls by
+ comparing their co-ordinates.
+
+ Parameters
+ ----------
+ t : object
+
+ columns : list
+
+ Returns
+ -------
+ c : int
+ """
for c in range(len(columns)):
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
return c
def reduce_index(t, rotated, r_idx, c_idx):
+ Shift a text object if it lies within a spanning cell, taking
+ into account table rotation.
+
+ Parameters
+ ----------
+ t : object
+
+ rotated : string
+
+ r_idx : int
+
+ c_idx : int
+
+ Returns
+ -------
+ r_idx : int
+
+ c_idx : int
+ """
if not rotated:
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
@@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):
def outline(t):
+ """Light up table boundary.
+
+ Parameters
+ ----------
+ t : object
+
+ Returns
+ -------
+ t : object
+ """
for i in range(len(t.cells)):
t.cells[i][0].left = True
t.cells[i][len(t.cells[i]) - 1].right = True
@@ -99,7 +225,19 @@ def outline(t):
return t
-def fill(t, f):
+def fill(t, f=None):
+ """Fill spanning cells.
+
+ Parameters
+ ----------
+ t : object
+
+ f : string, default: None, optional
+
+ Returns
+ -------
+ t : object
+ """
if f == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
@@ -124,6 +262,16 @@ def fill(t, f):
def remove_empty(d):
+ """Remove empty rows and columns.
+
+ Parameters
+ ----------
+ d : list
+
+ Returns
+ -------
+ d : list
+ """
for i, row in enumerate(d):
if row == [''] * len(row):
d.pop(i)