Make code PEP8 compliant

2016-07-11 15:19:38 +05:30
parent f6869a9af4
commit b87d2350dc
9 changed files with 765 additions and 489 deletions
@@ -1,30 +1,70 @@
 Camelot
 -------

-usage: python2 camelot.py [options] pdf_file
+Description: Parse tables from pdfs!

-Parse yo pdf!
+Dependencies
+
+Install
+
+Usage: python2 camelot.py [options] file

 positional arguments:
+
  file

 optional arguments:
-  -h, --help            show this help message and exit

-  -p PAGES [PAGES ...]  Specify the page numbers and/or page ranges to be
-                        parsed. Example: -p="1 3-5 9", -p="all" (default:
-                        -p="1")
+  -h, --help

-  -f FORMAT             Output format (csv/xlsx). Example: -f="xlsx" (default:
-                        -f="csv")
+    show this help message and exit

-  -spreadsheet          Extract data stored in pdfs with ruling lines.
-                        (default: False)
+  -p, --pages PAGES [PAGES ...]

-  -F ORIENTATION        Fill the values in empty cells. Example: -F="h",
-                        -F="v", -F="hv" (default: None)
+    Specify the page numbers and/or page ranges to be
+    parsed. Example: -p="1 3-5 9", -p="all" (default: 1)

-  -s [SCALE]            Scaling factor. Large scaling factor leads to smaller
-                        lines being detected. (default: 15)
+  -f, --format FORMAT

-Under construction...
+    Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
+
+  -m, --spreadsheet
+
+    Extract tables with ruling lines. (default: False)
+
+  -F, --fill FILL
+
+    Fill the values in empty cells horizontally(h) and/or
+    vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
+
+  -s, --scale [SCALE]
+
+    Scaling factor. Large scaling factor leads to smaller
+    lines being detected. (default: 15)
+
+  -j, --jtol [JTOL]
+
+    Tolerance to account for when comparing joint and line
+    coordinates. (default: 2)
+
+  -M, --mtol [MTOL]
+
+    Tolerance to account for when merging lines which are
+    very close. (default: 2)
+
+  -i, --invert
+
+    Make sure lines are in foreground. (default: False)
+
+  -d, --debug DEBUG
+
+    Debug by visualizing contours, lines, joints, tables.
+    Example: --debug="contours"
+
+  -o, --output OUTPUT
+
+    Specify output directory.
+
+Development: Code, Contributing, Tests
+
+License
@@ -4,55 +4,76 @@ import numpy as np

 from pdf import get_pdf_info

+
 def overlap(l):
-	merged = []
-	for higher in l:
-		if not merged:
-			merged.append(higher)
-		else:
-			lower = merged[-1]
-			if higher[0] >= lower[0] and higher[1] <= lower[1]:
-				upper_bound = max(lower[1], higher[1])
-				lower_bound = min(lower[0], higher[0])
-				merged[-1] = (lower_bound, upper_bound)
-			else:
-				merged.append(higher)
-	return merged
+    merged = []
+    for higher in l:
+        if not merged:
+            merged.append(higher)
+        else:
+            lower = merged[-1]
+            if higher[0] <= lower[1]:
+                upper_bound = max(lower[1], higher[1])
+                lower_bound = min(lower[0], higher[0])
+                merged[-1] = (lower_bound, upper_bound)
+            else:
+                merged.append(higher)
+    return merged
+

 def get_row_idx(t, rows):
-	for r in range(len(rows)):
-		if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
-			return r
+    for r in range(len(rows)):
+        if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
+            return r
+

 def get_column_idx(t, columns):
-	for c in range(len(columns)):
-		if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
-			return c
+    for c in range(len(columns)):
+        if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
+            return c
+

 def basic(pdf_dir, filename):
-	print "working on", filename
-	text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
-	rows, columns = [], []
-	for t in text:
-		rows.append((t.y1, t.y0))
-		columns.append((t.x0, t.x1))
-	rows = list(set(rows))
-	rows = sorted(rows, reverse=True)
-	columns = list(set(columns))
-	columns = sorted(columns)
-	columns = overlap(columns)
-	table = [['' for c in columns] for r in rows]
-	for t in text:
-		r_idx = get_row_idx(t, rows)
-		c_idx = get_column_idx(t, columns)
-		if None in [r_idx, c_idx]:
-			print t
-		else:
-			table[r_idx][c_idx] = t.get_text().strip('\n')
+    print "working on", filename
+    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
+    text.sort(key=lambda x: (-x.y0, x.x0))
+    y_last = 0
+    data = []
+    temp = []
+    elements = []
+    for t in text:
+        # is checking for upright necessary?
+        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+        # type(obj) is LTChar]):
+        if t.get_text().strip():
+            if not np.isclose(y_last, t.y0, atol=2):
+                y_last = t.y0
+                elements.append(len(temp))
+                data.append(temp)
+                temp = []
+            temp.append(t)
+    # drop rows holding a single text element: a table is assumed to have more than one column
+    elements = filter(lambda x: x != 1, elements)
+    # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
+    mode = max(set(elements), key=elements.count)
+    columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
+    columns = overlap(sorted(columns))
+    columns = [(c[0] + c[1]) / 2.0 for c in columns]

-	csvname = filename.split('.')[0] + '.csv'
-	csvpath = os.path.join(pdf_dir, csvname)
-	with open(csvpath, 'w') as outfile:
-		writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-	 	for cell in table:
-			writer.writerow([ce for ce in cell])
+    output = [['' for c in columns] for d in data]
+    for row, d in enumerate(data):
+        for t in d:
+            cog = (t.x0 + t.x1) / 2.0
+            diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
+            idx = min(diff, key=lambda x: x[1])
+            if output[row][idx[0]]:
+                output[row][idx[0]] += ' ' + t.get_text().strip()
+            else:
+                output[row][idx[0]] = t.get_text().strip()
+
+    csvname = filename.split('.')[0] + '.csv'
+    csvpath = os.path.join(pdf_dir, csvname)
+    with open(csvpath, 'w') as outfile:
+        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
+        for row in output:
+            writer.writerow([cell.encode('utf-8') for cell in row])
@@ -12,90 +12,118 @@ from spreadsheet import spreadsheet

 pno = re.compile(r'\d+')

+
 def mkdir(directory):
    if not os.path.isdir(directory):
        os.makedirs(directory)

+
 def filesort(filename):
-	filename = filename.split('/')[-1]
-	num = pno.findall(filename)
-	if len(num) == 2:
-		return (int(num[0]), int(num[1]))
-	else:
-		return (int(num[0]), 0)
+    filename = filename.split('/')[-1]
+    num = pno.findall(filename)
+    if len(num) == 2:
+        return (int(num[0]), int(num[1]))
+    else:
+        return (int(num[0]), 0)

 start_time = time.time()
 CAMELOT_DIR = '.camelot/'
 mkdir(CAMELOT_DIR)

-parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
-parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")')
-parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"])
-parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)')
-parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
+parser = argparse.ArgumentParser(
+    description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
+parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
+                    help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
+parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
+                    help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
+parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
+                    help='Extract tables with ruling lines. (default: False)')
+parser.add_argument('-F', '--fill', action='store', dest='fill',
+                    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
+parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
+                    help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
+parser.add_argument('-j', '--jtol', nargs='?', action='store',
+                    dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
+parser.add_argument('-M', '--mtol', nargs='?', action='store',
+                    dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
+parser.add_argument('-i', '--invert', action='store_true', dest='invert',
+                    help='Make sure lines are in foreground. (default: False)')
+parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
+                    help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
+parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
+                    help='Specify output directory.')
 parser.add_argument('file', nargs=1)

 result = parser.parse_args()

 if result.pages:
-	if result.pages == ['all']:
-		p = result.pages
-	else:
-		p = []
-		for r in result.pages[0].split(' '):
-			if '-' in r:
-				a, b = r.split('-')
-				a, b = int(a), int(b)
-				p.extend([str(i) for i in range(a, b + 1)])
-			else:
-				p.extend([str(r)])
+    if result.pages == ['all']:
+        p = result.pages
+    else:
+        p = []
+        for r in result.pages[0].split(' '):
+            if '-' in r:
+                a, b = r.split('-')
+                a, b = int(a), int(b)
+                p.extend([str(i) for i in range(a, b + 1)])
+            else:
+                p.extend([str(r)])
 else:
-	p = ['1']
+    p = ['1']
 p = sorted(set(p))

-s = result.spreadsheet
-
-pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
-mkdir(pdf_dir)
 filename = result.file[0].split('/')[-1]
-logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG)
+# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
+pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
+mkdir(pdf_dir)
+logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
+                    0] + '.log'), filemode='w', level=logging.DEBUG)

 shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
 print "separating pdf into pages"
 print
 if p == ['all']:
-	subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
+    subprocess.call(['pdfseparate', os.path.join(
+        pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
 else:
-	for page in p:
-		subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
+    for page in p:
+        subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
+            pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])

-if s:
-	print "using the spreadsheet method"
-	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-		print "converting", g.split('/')[-1], "to image"
-		os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png']))
-		spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale)
+if result.spreadsheet:
+    print "using the spreadsheet method"
+    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
+        print "converting", g.split('/')[-1], "to image"
+        os.system(' '.join(['convert', '-density', '300',
+                            g, '-depth', '8', g[:-4] + '.png']))
+        try:
+            spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
+                        result.jtol, result.mtol, result.invert, result.debug)
+        except:
+          logging.error("Couldn't parse " + g.split('/')[-1])
+          print "Couldn't parse", g.split('/')[-1]
 else:
-	print "using the basic method"
-	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-		basic(pdf_dir, g.split('/')[-1])
+    print "using the basic method"
+    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
+        basic(pdf_dir, g.split('/')[-1])

 if result.format == ['xlsx']:
-	import csv
-	from pyexcel_xlsx import save_data
-	from collections import OrderedDict
-	data = OrderedDict()
-	for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
-		print "adding", c.split('/')[-1], "to excel file"
-		with open(c, 'r') as csvfile:
-			reader = csv.reader(csvfile)
-			data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]})
-	xlsxname = filename.split('.')[0] + '.xlsx'
-	xlsxpath = os.path.join(pdf_dir, xlsxname)
-	save_data(xlsxpath, data)
-	print
-	print "saved as", xlsxname
+    import csv
+    from pyexcel_xlsx import save_data
+    from collections import OrderedDict
+    data = OrderedDict()
+    for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
+        print "adding", c.split('/')[-1], "to excel file"
+        with open(c, 'r') as csvfile:
+            reader = csv.reader(csvfile)
+            data.update({c.split('/')[-1].split('.')
+                         [0]: [row for row in reader]})
+    xlsxname = filename.split('.')[0] + '.xlsx'
+    xlsxpath = os.path.join(pdf_dir, xlsxname)
+    save_data(xlsxpath, data)
+    print
+    print "saved as", xlsxname

 print "finished in", time.time() - start_time, "seconds"
-logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds")
+logging.info("Time taken for " + filename + ": " +
+             str(time.time() - start_time) + " seconds")
@@ -1,23 +1,24 @@
 class Cell:
-	def __init__(self, x1, y1, x2, y2):
-		self.lb = (x1, y1)
-		self.lt = (x1, y2)
-		self.rb = (x2, y1)
-		self.rt = (x2, y2)
-		self.bbox = (x1, y1, x2, y2)
-		self.left = False
-		self.right = False
-		self.top = False
-		self.bottom = False
-		self.text = ''
-		self.spanning_h = False
-		self.spanning_v = False

-	def add_text(self, text):
-		self.text += text
-	
-	def get_text(self):
-		return self.text
+    def __init__(self, x1, y1, x2, y2):
+        self.lb = (x1, y1)
+        self.lt = (x1, y2)
+        self.rb = (x2, y1)
+        self.rt = (x2, y2)
+        self.bbox = (x1, y1, x2, y2)
+        self.left = False
+        self.right = False
+        self.top = False
+        self.bottom = False
+        self.text = ''
+        self.spanning_h = False
+        self.spanning_v = False

-	def get_bounded_edges(self):
-		return self.top + self.bottom + self.left + self.right
+    def add_text(self, text):
+        self.text += text
+
+    def get_text(self):
+        return self.text
+
+    def get_bounded_edges(self):
+        return self.top + self.bottom + self.left + self.right
@@ -1,73 +1,75 @@
 import cv2
 import numpy as np

-def transform(x, y, img_x, img_y, pdf_x, pdf_y):
-	x *= pdf_x / float(img_x)
-	y = abs(y - img_y)
-	y *= pdf_y / float(img_y)
-	return x, y

-# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-def morph(imagename, p_x, p_y, s):
-	img = cv2.imread(imagename)
-	img_x, img_y = img.shape[1], img.shape[0]
-	pdf_x, pdf_y = p_x, p_y
-	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-	# empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-	threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-	vertical = threshold
-	horizontal = threshold
+def morph_transform(imagename, s, invert):
+    # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
+    img = cv2.imread(imagename)
+    img_x, img_y = img.shape[1], img.shape[0]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    # empirical result taken from
+    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
+    if invert:
+        threshold = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    else:
+        threshold = cv2.adaptiveThreshold(np.invert(
+            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    vertical = threshold
+    horizontal = threshold

-	scale = s
-	verticalsize = vertical.shape[0] / scale
-	horizontalsize = horizontal.shape[1] / scale
+    scale = s
+    verticalsize = vertical.shape[0] / scale
+    horizontalsize = horizontal.shape[1] / scale

-	ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-	hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
+    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
+    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

-	vertical = cv2.erode(vertical, ver, (-1, -1))
-	vertical = cv2.dilate(vertical, ver, (-1, -1))
+    vertical = cv2.erode(vertical, ver, (-1, -1))
+    vertical = cv2.dilate(vertical, ver, (-1, -1))

-	horizontal = cv2.erode(horizontal, hor, (-1, -1))
-	horizontal = cv2.dilate(horizontal, hor, (-1, -1))
+    horizontal = cv2.erode(horizontal, hor, (-1, -1))
+    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

-	mask = vertical + horizontal
-	joints = np.bitwise_and(vertical, horizontal)
-	_, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-	contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+    mask = vertical + horizontal
+    joints = np.bitwise_and(vertical, horizontal)
+    _, contours, _ = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

-	tables = {}
-	for c in contours:
-		c_poly = cv2.approxPolyDP(c, 3, True)
-		x, y, w, h = cv2.boundingRect(c_poly)
-		# find number of non-zero values in joints using what boundingRect returns
-		roi = joints[y:y+h, x:x+w]
-		_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-		if len(jc) <= 4: # remove contours with less than <=4 joints
-			continue
-		joint_coords = []
-		for j in jc:
-			jx, jy, jw, jh = cv2.boundingRect(j)
-			c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2
-			c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y)
-			joint_coords.append((c1, c2))
-		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
-		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
-		tables[(x1, y2, x2, y1)] = joint_coords
+    tables = {}
+    for c in contours:
+        c_poly = cv2.approxPolyDP(c, 3, True)
+        x, y, w, h = cv2.boundingRect(c_poly)
+        # find number of non-zero values in joints using what boundingRect
+        # returns
+        roi = joints[y:y + h, x:x + w]
+        _, jc, _ = cv2.findContours(
+            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        if len(jc) <= 4:  # skip contours with 4 or fewer joints
+            continue
+        joint_coords = []
+        for j in jc:
+            jx, jy, jw, jh = cv2.boundingRect(j)
+            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+            joint_coords.append((c1, c2))
+        tables[(x, y + h, x + w, y)] = joint_coords

-	v_segments, h_segments = [], []
-	_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-	for vc in vcontours:
-		x, y, w, h = cv2.boundingRect(vc)
-		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
-		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
-		v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-		
-	_, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-	for hc in hcontours:
-		x, y, w, h = cv2.boundingRect(hc)
-		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
-		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
-		h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+    v_segments, h_segments = [], []
+    _, vcontours, _ = cv2.findContours(
+        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for vc in vcontours:
+        x, y, w, h = cv2.boundingRect(vc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

-	return tables, v_segments, h_segments
+    _, hcontours, _ = cv2.findContours(
+        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for hc in hcontours:
+        x, y, w, h = cv2.boundingRect(hc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+    return tables, v_segments, h_segments
@@ -8,47 +8,51 @@ from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal

-text = []

-def parse_text_basic(layout):
-	global text
-	try:
-		for obj in layout._objs:
-			if type(obj) is LTTextLineHorizontal:
-				text.append(obj)
-			parse_text_basic(obj)
-	except AttributeError:
-		pass
+def parse_text_basic(layout, t=None):
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if type(obj) is LTTextLineHorizontal:
+                t.append(obj)
+            else:
+                t += parse_text_basic(obj)
+    except AttributeError:
+        pass
+    return t
+
+
+def parse_text_spreadsheet(layout, t=None):
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if type(obj) is LTChar:
+                t.append(obj)
+            else:
+                t += parse_text_spreadsheet(obj)
+    except AttributeError:
+        pass
+    return t

-def parse_text_spreadsheet(layout):
-	global text
-	try:
-		for obj in layout._objs:
-			if type(obj) is LTChar:
-				text.append(obj)
-			parse_text_spreadsheet(obj)
-	except AttributeError:
-		pass

 def get_pdf_info(pdfname, method):
-	global text
-	with open(pdfname, 'r') as f:
-		parser = PDFParser(f)
-		document = PDFDocument(parser)
-		if not document.is_extractable:
-			raise PDFTextExtractionNotAllowed
-		laparams = LAParams()
-		rsrcmgr = PDFResourceManager()
-		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-		interpreter = PDFPageInterpreter(rsrcmgr, device)
-		for page in PDFPage.create_pages(document):
-			interpreter.process_page(page)
-			layout = device.get_result()
-			text = []
-			if method == 'basic':
-				parse_text_basic(layout)
-			elif method == 'spreadsheet':
-				parse_text_spreadsheet(layout)
-			pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
-		text.sort(key=lambda x: (-x.y0, x.x0))
-	return text, pdf_x, pdf_y
+    with open(pdfname, 'r') as f:
+        parser = PDFParser(f)
+        document = PDFDocument(parser)
+        if not document.is_extractable:
+            raise PDFTextExtractionNotAllowed
+        laparams = LAParams()
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            if method == 'basic':
+                text = parse_text_basic(layout)
+            elif method == 'spreadsheet':
+                text = parse_text_spreadsheet(layout)
+            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
+    return text, pdf_x, pdf_y
@@ -1,129 +1,175 @@
 import os
 import csv
+import cv2
 import glob
 import numpy as np
-import matplotlib.pyplot as plt

 from table import Table
 from pdf import get_pdf_info
-from morph_transform import morph
+from morph_transform import morph_transform
+from utils import (translate, scale, merge_close_values, get_row_idx,
+                   get_column_idx, reduce_index, outline, fill, remove_empty)

-def remove_close_values(ar):
-	ret = []
-	for a in ar:
-		if not ret:
-			ret.append(a)
-		else:
-			temp = ret[-1]
-			if np.isclose(temp, a, atol=2):
-				pass
-			else:
-				ret.append(a)
-	return ret

-def merge_close_values(ar):
-	ret = []
-	for a in ar:
-		if not ret:
-			ret.append(a)
-		else:
-			temp = ret[-1]
-			if np.isclose(temp, a, atol=2):
-				temp = (temp + a) / 2.0
-				ret[-1] = temp
-			else:
-				ret.append(a)
-	return ret
+def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
+    if debug:
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+    print "working on", filename
+    imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
+    img = cv2.imread(imagename)
+    img_x, img_y = img.shape[1], img.shape[0]
+    text, pdf_x, pdf_y = get_pdf_info(
+        os.path.join(pdf_dir, filename), 'spreadsheet')
+    scaling_factor_x = pdf_x / float(img_x)
+    scaling_factor_y = pdf_y / float(img_y)
+    tables, v_segments, h_segments = morph_transform(imagename, s, invert)

-def get_row_idx(t, rows):
-	for r in range(len(rows)):
-		if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]:
-			return r
+    if debug == ["contours"]:
+        for t in tables.keys():
+            cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
+        plt.imshow(img)
+    if debug == ["joints"]:
+        x_coord = []
+        y_coord = []
+        for k in tables.keys():
+            for coord in tables[k]:
+                x_coord.append(coord[0])
+                y_coord.append(coord[1])
+        max_x, max_y = max(x_coord), max(y_coord)
+        plt.plot(x_coord, y_coord, 'ro')
+        plt.axis([0, max_x + 100, max_y + 100, 0])
+        plt.imshow(img)

-def get_column_idx(t, columns):
-	for c in range(len(columns)):
-		if abs(t.x0 + t.x1) / 2.0 > columns[c][0] and abs(t.x0 + t.x1) / 2.0 < columns[c][1]:
-			return c
+    # detect whether the text is mostly rotated (share of non-upright characters)
+    num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
+    num_h = [t for t in text if t.upright and t.get_text().strip()]
+    vger = len(num_v) / float(len(num_v) + len(num_h))
+    rotated = ''
+    if vger > 0.8:
+        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
+        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
+        rotated = 'left' if clockwise < anticlockwise else 'right'

-def reduce_index(t, r_idx, c_idx):
-	if t.cells[r_idx][c_idx].spanning_h:
-		while not t.cells[r_idx][c_idx].left:
-			c_idx -= 1
-	if t.cells[r_idx][c_idx].spanning_v:
-		while not t.cells[r_idx][c_idx].top:
-			r_idx -= 1
-	return r_idx, c_idx
+    tables_new = {}
+    for k in tables.keys():
+        x1, y1, x2, y2 = k
+        x1 = scale(x1, scaling_factor_x)
+        y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y)
+        x2 = scale(x2, scaling_factor_x)
+        y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y)
+        j_x, j_y = zip(*tables[k])
+        j_x = [scale(j, scaling_factor_x) for j in j_x]
+        j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
+        joints = zip(j_x, j_y)
+        tables_new[(x1, y1, x2, y2)] = joints

-def fill(t, orientation):
-	if orientation == "h":
-		for i in range(len(t.cells)):
-			for j in range(len(t.cells[i])):
-				if t.cells[i][j].get_text().strip() == '':
-					if t.cells[i][j].spanning_h:
-						t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
-	elif orientation == "v":
-		for i in range(len(t.cells)):
-			for j in range(len(t.cells[i])):
-				if t.cells[i][j].get_text().strip() == '':
-					if t.cells[i][j].spanning_v:
-						t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
-	elif orientation == "hv":
-		for i in range(len(t.cells)):
-			for j in range(len(t.cells[i])):
-				if t.cells[i][j].get_text().strip() == '':
-					if t.cells[i][j].spanning_h:
-						t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
-					elif t.cells[i][j].spanning_v:
-						t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
-	return t
+    v_segments_new = []
+    for v in v_segments:
+        x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
+        y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
+            abs(translate(-img_y, v[3])), scaling_factor_y)
+        v_segments_new.append((x1, y1, x2, y2))

-def spreadsheet(pdf_dir, filename, orientation, scale):
-	print "working on", filename
-	imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
-	text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
-	tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
+    h_segments_new = []
+    for h in h_segments:
+        x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
+        y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
+            abs(translate(-img_y, h[3])), scaling_factor_y)
+        h_segments_new.append((x1, y1, x2, y2))

-	num_tables = 0
-	for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord
-		# find rows and columns that lie in table
-		lb = (k[0], k[1])
-		rt = (k[2], k[3])
-		v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
-		h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
-		columns, rows = zip(*tables[k])
-		# sort horizontal and vertical segments
-		columns = merge_close_values(sorted(list(columns)))
-		rows = merge_close_values(sorted(list(rows), reverse=True))
-		# make grid using x and y coord of shortlisted rows and columns
-		columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
-		rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+    num_tables = 0
+    # sort tables based on y-coord
+    for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
+        # keep the vertical and horizontal segments that lie within the table's bounding box
+        lb = (k[0], k[1])
+        rt = (k[2], k[3])
+        v_s = [v for v in v_segments_new if v[1] > lb[1] - 2 and v[3]
+               < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
+        h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
+               < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]

-		table = Table(columns, rows)
-		# pass row and column line segments to table method and light up cell edges
-		table = table.set_edges(v_s, h_s)
-		# table set span method
-		table = table.set_spanning()
-		# fill text after sorting it
-		text.sort(key=lambda x: (-x.y0, x.x0))
+        if debug == ["lines"]:
+            for v in v_s:
+                plt.plot([v[0], v[2]], [v[1], v[3]])
+            for h in h_s:
+                plt.plot([h[0], h[2]], [h[1], h[3]])

-		for t in text:
-			r_idx = get_row_idx(t, rows)
-			c_idx = get_column_idx(t, columns)
-			if None in [r_idx, c_idx]:
-				pass
-			else:
-				r_idx, c_idx = reduce_index(table, r_idx, c_idx)
-				table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
+        columns, rows = zip(*tables_new[k])
+        columns, rows = list(columns), list(rows)
+        columns.extend([lb[0], rt[0]])
+        rows.extend([lb[1], rt[1]])
+        # sort the column (x) and row (y) coordinates and merge values that are close together
+        columns = merge_close_values(sorted(columns), mtol)
+        rows = merge_close_values(sorted(rows, reverse=True), mtol)
+        # make grid using x and y coord of shortlisted rows and columns
+        columns = [(columns[i], columns[i + 1])
+                   for i in range(0, len(columns) - 1)]
+        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]

-		if orientation:
-			table = fill(table, orientation)
+        table = Table(columns, rows)
+        # light up cell edges
+        table = table.set_edges(v_s, h_s, jtol)
+        # table set span method
+        table = table.set_spanning()
+        # TODO
+        table = outline(table)

-		csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
-		csvpath = os.path.join(pdf_dir, csvname)
-		with open(csvpath, 'w') as outfile:
-			writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-		 	for i in range(len(table.cells)):
-				writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))])
-			print "saved as", csvname
-			print
-		num_tables += 1
+        if debug == ["tables"]:
+            for i in range(len(table.cells)):
+                for j in range(len(table.cells[i])):
+                    if table.cells[i][j].left:
+                        plt.plot([table.cells[i][j].lb[0], table.cells[i][j].lt[0]],
+                                 [table.cells[i][j].lb[1], table.cells[i][j].lt[1]])
+                    if table.cells[i][j].right:
+                        plt.plot([table.cells[i][j].rb[0], table.cells[i][j].rt[0]],
+                                 [table.cells[i][j].rb[1], table.cells[i][j].rt[1]])
+                    if table.cells[i][j].top:
+                        plt.plot([table.cells[i][j].lt[0], table.cells[i][j].rt[0]],
+                                 [table.cells[i][j].lt[1], table.cells[i][j].rt[1]])
+                    if table.cells[i][j].bottom:
+                        plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
+                                 [table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
+        if debug:
+            plt.show()
+
+        # fill text after sorting it
+        if not rotated:
+            text.sort(key=lambda x: (-x.y0, x.x0))
+        elif rotated == 'left':
+            text.sort(key=lambda x: (x.x0, x.y0))
+        elif rotated == 'right':
+            text.sort(key=lambda x: (-x.x0, -x.y0))
+
+        for t in text:
+            r_idx = get_row_idx(t, rows)
+            c_idx = get_column_idx(t, columns)
+            if None in [r_idx, c_idx]:
+                pass
+            else:
+                r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
+                table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
+
+        if fill:
+            table = fill(table, fill)
+
+        data = []
+        for i in range(len(table.cells)):
+            data.append([table.cells[i][j].get_text().strip().encode('utf-8')
+                         for j in range(len(table.cells[i]))])
+        if rotated == 'left':
+            data = zip(*data[::-1])
+        elif rotated == 'right':
+            data = zip(*data[::1])
+            data.reverse()
+        data = remove_empty(data)
+        csvname = filename.split(
+            '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
+        csvpath = os.path.join(pdf_dir, csvname)
+        with open(csvpath, 'w') as outfile:
+            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
+            for d in data:
+                writer.writerow(d)
+            print "saved as", csvname
+            print
+        num_tables += 1
@@ -1,151 +1,152 @@
 import numpy as np
+
 from cell import Cell

+
 class Table:
-	def __init__(self, columns, rows):
-		self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows]
-		self.columns = columns
-		self.rows = rows

-	def set_edges(self, vertical, horizontal):
-		for v in vertical:
-			# find closest x coord
-			# iterate over y coords and find closest points
-			i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)]
-			j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
-			k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
-			if not j:
-				continue
-			if i == [0]: # only left edge
-				if k:
-					I = i[0]
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[J][I].left = True
-						J += 1
-				else:
-					I = i[0]
-					J = j[0]
-					K = len(self.rows)
-					while J < K:
-						self.cells[J][I].left = True
-						J += 1
-			elif i == []: # only right edge
-				if k:
-					I = len(self.columns) - 1
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[J][I].right = True
-						J += 1
-				else:
-					I = len(self.columns) - 1
-					J = j[0]
-					K = len(self.rows)
-					while J < K:
-						self.cells[J][I].right = True
-						J += 1
-			else: # both left and right edges
-				if k:
-					I = i[0]
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[J][I].left = True
-						self.cells[J][I - 1].right = True
-						J += 1
-				else:
-					I = i[0]
-					J = j[0]
-					K = len(self.rows)
-					while J < K:
-						self.cells[J][I].left = True
-						self.cells[J][I - 1].right = True
-						J += 1
+    def __init__(self, columns, rows):
+        """Build the grid of cells spanned by ``columns`` and ``rows``.
+
+        Each column and each row is indexed at ``[0]`` and ``[1]``; one
+        ``Cell`` is created per (row, column) pair from those four values.
+        ``self.cells`` is row-major: ``self.cells[row][column]``.
+        """
+        grid = []
+        for row in rows:
+            # One list of cells per row, ordered like ``columns``.
+            grid.append([Cell(col[0], row[1], col[1], row[0])
+                         for col in columns])
+        self.cells = grid
+        self.columns = columns
+        self.rows = rows

-		for h in horizontal:
-			#  find closest y coord
-			# iterate over x coords and find closest points
-			i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)]
-			j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)]
-			k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)]
-			if not j:
-				continue
-			if i == [0]: # only top edge
-				if k:
-					I = i[0]
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[I][J].top = True
-						J += 1
-				else:
-					I = i[0]
-					J = j[0]
-					K = len(self.columns)
-					while J < K:
-						self.cells[I][J].top = True
-						J += 1
-			elif i == []: # only bottom edge
-				if k:
-					I = len(self.rows) - 1
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[I][J].bottom = True
-						J += 1
-				else:
-					I = len(self.rows) - 1
-					J = j[0]
-					K = len(self.columns)
-					while J < K:
-						self.cells[I][J].bottom = True
-						J += 1
-			else: # both top and bottom edges
-				if k:
-					I = i[0]
-					J = j[0]
-					K = k[0]
-					while J < K:
-						self.cells[I][J].top = True
-						self.cells[I - 1][J].bottom = True
-						J += 1
-				else:
-					I = i[0]
-					J = j[0]
-					K = len(self.columns)
-					while J < K:
-						self.cells[I][J].top = True
-						self.cells[I - 1][J].bottom = True
-						J += 1
+    def set_edges(self, vertical, horizontal, jtol):
+        """Mark cell borders from detected vertical and horizontal lines.
+
+        Every line is matched against the first coordinate (``t[0]``) of
+        the table's columns and rows using ``np.isclose`` with absolute
+        tolerance ``jtol``; the cells the line runs along then get their
+        ``left``/``right`` or ``top``/``bottom`` flag set to True.
+
+        NOTE(review): each line is indexed at [0]..[3]; presumably these
+        are the (x1, y1, x2, y2) end points of a segment -- confirm
+        against the caller.
+
+        Returns ``self`` so calls can be chained.
+        """
+        for v in vertical:
+            # find closest x coord
+            # iterate over y coords and find closest points
+            # i: columns whose first coordinate matches v[0]
+            i = [i for i, t in enumerate(self.columns)
+                 if np.isclose(v[0], t[0], atol=jtol)]
+            # j: rows whose first coordinate matches v[3] (first row touched)
+            j = [j for j, t in enumerate(self.rows)
+                 if np.isclose(v[3], t[0], atol=jtol)]
+            # k: rows whose first coordinate matches v[1] (exclusive end row)
+            k = [k for k, t in enumerate(self.rows)
+                 if np.isclose(v[1], t[0], atol=jtol)]
+            if not j:
+                # no starting row matched: the line is ignored
+                continue
+            J = j[0]
+            if i == [0]:  # only left edge
+                I = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][I].left = True
+                        J += 1
+                else:
+                    # no end row matched: run to the last row
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][I].left = True
+                        J += 1
+            elif i == []:  # only right edge
+                # no column start matched: treated as the right border
+                # of the last column
+                I = len(self.columns) - 1
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][I].right = True
+                        J += 1
+                else:
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][I].right = True
+                        J += 1
+            else:  # both left and right edges
+                # the line is the left border of column I and the right
+                # border of column I - 1
+                I = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[J][I].left = True
+                        self.cells[J][I - 1].right = True
+                        J += 1
+                else:
+                    K = len(self.rows)
+                    while J < K:
+                        self.cells[J][I].left = True
+                        self.cells[J][I - 1].right = True
+                        J += 1

-		return self
+        for h in horizontal:
+            #  find closest y coord
+            # iterate over x coords and find closest points
+            # i: rows whose first coordinate matches h[1]
+            i = [i for i, t in enumerate(self.rows)
+                 if np.isclose(h[1], t[0], atol=jtol)]
+            # j: columns whose first coordinate matches h[0] (first column)
+            j = [j for j, t in enumerate(self.columns)
+                 if np.isclose(h[0], t[0], atol=jtol)]
+            # k: columns whose first coordinate matches h[2] (exclusive end)
+            k = [k for k, t in enumerate(self.columns)
+                 if np.isclose(h[2], t[0], atol=jtol)]
+            if not j:
+                # no starting column matched: the line is ignored
+                continue
+            J = j[0]
+            if i == [0]:  # only top edge
+                I = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[I][J].top = True
+                        J += 1
+                else:
+                    # no end column matched: run to the last column
+                    K = len(self.columns)
+                    while J < K:
+                        self.cells[I][J].top = True
+                        J += 1
+            elif i == []:  # only bottom edge
+                # no row start matched: treated as the bottom border of
+                # the last row
+                I = len(self.rows) - 1
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[I][J].bottom = True
+                        J += 1
+                else:
+                    K = len(self.columns)
+                    while J < K:
+                        self.cells[I][J].bottom = True
+                        J += 1
+            else:  # both top and bottom edges
+                # the line is the top border of row I and the bottom
+                # border of row I - 1
+                I = i[0]
+                if k:
+                    K = k[0]
+                    while J < K:
+                        self.cells[I][J].top = True
+                        self.cells[I - 1][J].bottom = True
+                        J += 1
+                else:
+                    K = len(self.columns)
+                    while J < K:
+                        self.cells[I][J].top = True
+                        self.cells[I - 1][J].bottom = True
+                        J += 1

-	def set_spanning(self):
-		for i in range(len(self.cells)):
-			for j in range(len(self.cells[i])):
-				bound = self.cells[i][j].get_bounded_edges()
-				if bound == 4:
-					continue
-				elif bound == 3:
-					if not self.cells[i][j].left:
-						if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
-							self.cells[i][j].spanning_h = True
-					elif not self.cells[i][j].right:
-						if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
-							self.cells[i][j].spanning_h = True
-					elif not self.cells[i][j].top:
-						if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
-							self.cells[i][j].spanning_v = True
-					elif not self.cells[i][j].bottom:
-						if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
-							self.cells[i][j].spanning_v = True
-				elif bound == 2:
-					if self.cells[i][j].left and self.cells[i][j].right:
-						if not self.cells[i][j].top and not self.cells[i][j].bottom:
-							self.cells[i][j].spanning_v = True
-					elif self.cells[i][j].top and self.cells[i][j].bottom:
-						if not self.cells[i][j].left and not self.cells[i][j].right:
-							self.cells[i][j].spanning_h = True
-		return self
+        return self
+
+    def set_spanning(self):
+        """Flag cells that are part of a horizontal or vertical span.
+
+        The decision is based on how many edges ``get_bounded_edges()``
+        reports for a cell and on which of its ``left``/``right``/``top``/
+        ``bottom`` flags are set:
+
+        * 3 edges, ``left`` or ``right`` missing -> ``spanning_h``
+        * 3 edges, ``top`` or ``bottom`` missing -> ``spanning_v``
+        * 2 edges, only ``left`` and ``right``   -> ``spanning_v``
+        * 2 edges, only ``top`` and ``bottom``   -> ``spanning_h``
+
+        Returns ``self`` so calls can be chained.
+        """
+        for row in self.cells:
+            for cell in row:
+                edge_count = cell.get_bounded_edges()
+                if edge_count == 4:
+                    # fully enclosed cell: never part of a span
+                    continue
+
+                left, right = cell.left, cell.right
+                top, bottom = cell.top, cell.bottom
+
+                if edge_count == 3:
+                    # the first missing edge (left, right, top, bottom
+                    # order) decides the direction of the span
+                    if not left:
+                        if right and top and bottom:
+                            cell.spanning_h = True
+                    elif not right:
+                        if left and top and bottom:
+                            cell.spanning_h = True
+                    elif not top:
+                        if left and right and bottom:
+                            cell.spanning_v = True
+                    elif not bottom:
+                        if left and right and top:
+                            cell.spanning_v = True
+
+                elif edge_count == 2:
+                    if left and right:
+                        if not (top or bottom):
+                            cell.spanning_v = True
+                    elif top and bottom:
+                        if not (left or right):
+                            cell.spanning_h = True
+
+        return self
@@ -0,0 +1,133 @@
+import numpy as np
+
+
+def translate(x1, x2):
+    """Return ``x2`` shifted by the offset ``x1``.
+
+    The sum is computed with ``+`` rather than ``+=`` so that a mutable
+    ``x2`` (for example a numpy array) owned by the caller is never
+    modified in place.
+    """
+    return x2 + x1
+
+
+def scale(x, s):
+    """Return ``x`` multiplied by the factor ``s``.
+
+    The product is computed with ``*`` rather than ``*=`` so that a
+    mutable ``x`` (for example a numpy array) owned by the caller is
+    never modified in place.
+    """
+    return x * s
+
+
+def rotate(x1, y1, x2, y2, angle):
+    """Rotate the point (x2, y2) around the pivot (x1, y1).
+
+    ``angle`` is passed straight to ``np.sin``/``np.cos``, i.e. it is
+    in radians. Returns the rotated point as an ``(x, y)`` tuple.
+    """
+    sin_a, cos_a = np.sin(angle), np.cos(angle)
+    # Express the point relative to the pivot.
+    dx = translate(-x1, x2)
+    dy = translate(-y1, y2)
+    # Apply the 2-D rotation matrix [[cos, -sin], [sin, cos]].
+    rot_x = cos_a * dx - sin_a * dy
+    rot_y = sin_a * dx + cos_a * dy
+    # Move the result back into the original frame.
+    return translate(x1, rot_x), translate(y1, rot_y)
+
+
+def remove_close_values(ar, mtol):
+    """Drop values that lie within ``mtol`` of the last value kept.
+
+    Walks ``ar`` in order; a value is kept only when it is not close
+    (``np.isclose`` with ``atol=mtol``) to the most recently kept value.
+    Returns a new list.
+    """
+    kept = []
+    for value in ar:
+        if kept and np.isclose(kept[-1], value, atol=mtol):
+            # too close to the previous survivor: skip it
+            continue
+        kept.append(value)
+    return kept
+
+
+def merge_close_values(ar, mtol):
+    """Merge values that lie within ``mtol`` of the last value kept.
+
+    Walks ``ar`` in order; a value close (``np.isclose`` with
+    ``atol=mtol``) to the most recent entry replaces that entry with
+    the mean of the two, otherwise it is appended. Returns a new list.
+    """
+    merged = []
+    for value in ar:
+        if merged and np.isclose(merged[-1], value, atol=mtol):
+            # fold the value into the previous entry
+            merged[-1] = (merged[-1] + value) / 2.0
+            continue
+        merged.append(value)
+    return merged
+
+
+def get_row_idx(t, rows):
+    """Return the index of the row that contains ``t`` vertically.
+
+    ``t`` is located by the midpoint of ``t.y0`` and ``t.y1``; a row
+    matches when that midpoint lies strictly between ``row[1]`` and
+    ``row[0]``. Returns None when no row matches.
+    """
+    for idx, row in enumerate(rows):
+        mid_y = (t.y0 + t.y1) / 2.0
+        if row[0] > mid_y > row[1]:
+            return idx
+    return None
+
+
+def get_column_idx(t, columns):
+    """Return the index of the column that contains ``t`` horizontally.
+
+    ``t`` is located by the midpoint of ``t.x0`` and ``t.x1``; a column
+    matches when that midpoint lies strictly between ``column[0]`` and
+    ``column[1]``. Returns None when no column matches.
+    """
+    for idx, column in enumerate(columns):
+        mid_x = (t.x0 + t.x1) / 2.0
+        if column[0] < mid_x < column[1]:
+            return idx
+    return None
+
+
+def reduce_index(t, rotated, r_idx, c_idx):
+    """Move a cell index to the anchor cell of the span it belongs to.
+
+    Starting at ``t.cells[r_idx][c_idx]``: if the cell is flagged
+    ``spanning_h`` the column index is stepped until a cell with the
+    anchoring side edge is reached, then (for the cell reached) the same
+    is done for ``spanning_v`` with the row index. Which edge anchors a
+    span and the step direction depend on ``rotated``:
+
+    * falsy     -> left edge (step -1), top edge (step -1)
+    * 'left'    -> left edge (step -1), bottom edge (step +1)
+    * 'right'   -> right edge (step +1), top edge (step -1)
+
+    Any other ``rotated`` value returns the indices unchanged. No bounds
+    checking is done while stepping.
+
+    Returns the ``(r_idx, c_idx)`` tuple.
+    """
+    if not rotated:
+        h_edge, h_step, v_edge, v_step = 'left', -1, 'top', -1
+    elif rotated == 'left':
+        h_edge, h_step, v_edge, v_step = 'left', -1, 'bottom', 1
+    elif rotated == 'right':
+        h_edge, h_step, v_edge, v_step = 'right', 1, 'top', -1
+    else:
+        return r_idx, c_idx
+
+    if t.cells[r_idx][c_idx].spanning_h:
+        while not getattr(t.cells[r_idx][c_idx], h_edge):
+            c_idx += h_step
+    if t.cells[r_idx][c_idx].spanning_v:
+        while not getattr(t.cells[r_idx][c_idx], v_edge):
+            r_idx += v_step
+    return r_idx, c_idx
+
+
+def outline(t):
+    """Mark the outer border of the table ``t`` and return ``t``.
+
+    Sets ``left`` on the first cell and ``right`` on the last cell of
+    every row, ``top`` on every cell of the first row and ``bottom`` on
+    every cell of the last row. A table without rows (or a row without
+    cells) is left untouched instead of raising ``IndexError``.
+    """
+    if not t.cells:
+        return t
+    for row in t.cells:
+        if row:
+            row[0].left = True
+            row[-1].right = True
+    for cell in t.cells[0]:
+        cell.top = True
+    for cell in t.cells[-1]:
+        cell.bottom = True
+    return t
+
+
+def fill(t, f):
+    """Copy text into empty spanning cells of the table ``t``.
+
+    ``f`` selects the direction(s):
+
+    * ``"h"``  -- an empty ``spanning_h`` cell takes the text of the
+      cell to its left
+    * ``"v"``  -- an empty ``spanning_v`` cell takes the text of the
+      cell above it
+    * ``"hv"`` -- both, with the horizontal rule taking precedence for
+      a cell that spans in both directions
+
+    Any other value of ``f`` leaves the table unchanged. A cell counts
+    as empty when its text is blank after ``strip()``. Cells are visited
+    row by row, left to right, so a filled value propagates along the
+    span.
+
+    Returns ``t``.
+    """
+    if f not in ("h", "v", "hv"):
+        return t
+    horizontal = "h" in f
+    vertical = "v" in f
+    for i, row in enumerate(t.cells):
+        for j, cell in enumerate(row):
+            if cell.get_text().strip() != '':
+                continue
+            if horizontal and cell.spanning_h:
+                # j == 0 has no left neighbour; indexing j - 1 would wrap
+                # around to the last column of the row.
+                if j > 0:
+                    cell.add_text(row[j - 1].get_text())
+            elif vertical and cell.spanning_v:
+                # i == 0 has no neighbour above; indexing i - 1 would wrap
+                # around to the last row of the table.
+                if i > 0:
+                    cell.add_text(t.cells[i - 1][j].get_text())
+    return t
+
+
+def remove_empty(d):
+    """Return the rows of ``d`` without empty rows and empty columns.
+
+    A row is dropped when every cell in it equals ``''``; a column is
+    dropped when none of its cells is truthy. The input is not modified.
+    The rows are filtered with a comprehension instead of popping from
+    ``d`` during iteration, which skipped the row following each removed
+    one, and rows may be lists or tuples.
+
+    Returns a list of tuples, one per remaining row.
+    """
+    rows = [row for row in d if not all(cell == '' for cell in row)]
+    # Transpose, drop blank columns, transpose back.
+    columns = [column for column in zip(*rows) if any(column)]
+    return list(zip(*columns))