From b87d2350dc5927ddb9cd0e0cb9e6d84d8f593471 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 11 Jul 2016 15:19:38 +0530 Subject: [PATCH] Make code PEP8 compliant --- README.md | 70 ++++++++--- basic.py | 109 ++++++++++------- camelot.py | 140 +++++++++++++--------- cell.py | 41 +++---- morph_transform.py | 124 ++++++++++---------- pdf.py | 84 ++++++------- spreadsheet.py | 266 ++++++++++++++++++++++++----------------- table.py | 287 +++++++++++++++++++++++---------------------- utils.py | 133 +++++++++++++++++++++ 9 files changed, 765 insertions(+), 489 deletions(-) create mode 100644 utils.py diff --git a/README.md b/README.md index b57fafe..7b2f5e7 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,70 @@ Camelot ------- -usage: python2 camelot.py [options] pdf_file +Description: Parse tables from pdfs! -Parse yo pdf! +Dependencies + +Install + +Usage: python2 camelot.py [options] file positional arguments: + file optional arguments: - -h, --help show this help message and exit - -p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be - parsed. Example: -p="1 3-5 9", -p="all" (default: - -p="1") + -h, --help - -f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: - -f="csv") + show this help message and exit - -spreadsheet Extract data stored in pdfs with ruling lines. - (default: False) + -p, --pages PAGES [PAGES ...] - -F ORIENTATION Fill the values in empty cells. Example: -F="h", - -F="v", -F="hv" (default: None) + Specify the page numbers and/or page ranges to be + parsed. Example: -p="1 3-5 9", -p="all" (default: 1) - -s [SCALE] Scaling factor. Large scaling factor leads to smaller - lines being detected. (default: 15) + -f, --format FORMAT -Under construction... \ No newline at end of file + Output format (csv/xlsx). Example: -f="xlsx" (default: csv) + + -m, --spreadsheet + + Extract tables with ruling lines. (default: False) + + -F, --fill FILL + + Fill the values in empty cells horizontally(h) and/or + vertically(v). 
Example: -F="h", -F="v", -F="hv" (default: None) + + -s, --scale [SCALE] + + Scaling factor. Large scaling factor leads to smaller + lines being detected. (default: 15) + + -j, --jtol [JTOL] + + Tolerance to account for when comparing joint and line + coordinates. (default: 2) + + -M, --mtol [MTOL] + + Tolerance to account for when merging lines which are + very close. (default: 2) + + -i, --invert + + Make sure lines are in foreground. (default: False) + + -d, --debug DEBUG + + Debug by visualizing contours, lines, joints, tables. + Example: --debug="contours" + + -o, --output OUTPUT + + Specify output directory. + +Development: Code, Contributing, Tests + +License diff --git a/basic.py b/basic.py index bad348c..e7ce1f6 100644 --- a/basic.py +++ b/basic.py @@ -4,55 +4,76 @@ import numpy as np from pdf import get_pdf_info + def overlap(l): - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if higher[0] >= lower[0] and higher[1] <= lower[1]: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if higher[0] <= lower[1]: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + def get_row_idx(t, rows): - for r in range(len(rows)): - if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: - return r + for r in range(len(rows)): + if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: + return r + def get_column_idx(t, columns): - for c in range(len(columns)): - if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: - return c + for c in range(len(columns)): + if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: + return c + def basic(pdf_dir, filename): - print "working on", filename - text, _, _ = 
get_pdf_info(os.path.join(pdf_dir, filename), 'basic') - rows, columns = [], [] - for t in text: - rows.append((t.y1, t.y0)) - columns.append((t.x0, t.x1)) - rows = list(set(rows)) - rows = sorted(rows, reverse=True) - columns = list(set(columns)) - columns = sorted(columns) - columns = overlap(columns) - table = [['' for c in columns] for r in rows] - for t in text: - r_idx = get_row_idx(t, rows) - c_idx = get_column_idx(t, columns) - if None in [r_idx, c_idx]: - print t - else: - table[r_idx][c_idx] = t.get_text().strip('\n') + print "working on", filename + text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') + text.sort(key=lambda x: (-x.y0, x.x0)) + y_last = 0 + data = [] + temp = [] + elements = [] + for t in text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright for obj in t._objs if + # type(obj) is LTChar]): + if t.get_text().strip(): + if not np.isclose(y_last, t.y0, atol=2): + y_last = t.y0 + elements.append(len(temp)) + data.append(temp) + temp = [] + temp.append(t) + # a table can't have just 1 column, can it? 
+ elements = filter(lambda x: x != 1, elements) + # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count) + mode = max(set(elements), key=elements.count) + columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode] + columns = overlap(sorted(columns)) + columns = [(c[0] + c[1]) / 2.0 for c in columns] - csvname = filename.split('.')[0] + '.csv' - csvpath = os.path.join(pdf_dir, csvname) - with open(csvpath, 'w') as outfile: - writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) - for cell in table: - writer.writerow([ce for ce in cell]) \ No newline at end of file + output = [['' for c in columns] for d in data] + for row, d in enumerate(data): + for t in d: + cog = (t.x0 + t.x1) / 2.0 + diff = [(i, abs(cog - c)) for i, c in enumerate(columns)] + idx = min(diff, key=lambda x: x[1]) + if output[row][idx[0]]: + output[row][idx[0]] += ' ' + t.get_text().strip() + else: + output[row][idx[0]] = t.get_text().strip() + + csvname = filename.split('.')[0] + '.csv' + csvpath = os.path.join(pdf_dir, csvname) + with open(csvpath, 'w') as outfile: + writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) + for row in output: + writer.writerow([cell.encode('utf-8') for cell in row]) diff --git a/camelot.py b/camelot.py index 1da3b77..ec8fa96 100644 --- a/camelot.py +++ b/camelot.py @@ -12,90 +12,118 @@ from spreadsheet import spreadsheet pno = re.compile(r'\d+') + def mkdir(directory): if not os.path.isdir(directory): os.makedirs(directory) + def filesort(filename): - filename = filename.split('/')[-1] - num = pno.findall(filename) - if len(num) == 2: - return (int(num[0]), int(num[1])) - else: - return (int(num[0]), 0) + filename = filename.split('/')[-1] + num = pno.findall(filename) + if len(num) == 2: + return (int(num[0]), int(num[1])) + else: + return (int(num[0]), 0) start_time = time.time() CAMELOT_DIR = '.camelot/' mkdir(CAMELOT_DIR) -parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] 
pdf_file') -parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")') -parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"]) -parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)') -parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None) -parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) +parser = argparse.ArgumentParser( + description='Parse tables from pdfs!', usage='python2 camelot.py [options] file') +parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages', + help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)') +parser.add_argument('-f', '--format', nargs=1, action='store', dest='format', + help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"]) +parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet', + help='Extract tables with ruling lines. (default: False)') +parser.add_argument('-F', '--fill', action='store', dest='fill', + help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None) +parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale', + help='Scaling factor. Large scaling factor leads to smaller lines being detected. 
(default: 15)', default=15, type=int) +parser.add_argument('-j', '--jtol', nargs='?', action='store', + dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int) +parser.add_argument('-M', '--mtol', nargs='?', action='store', + dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int) +parser.add_argument('-i', '--invert', action='store_true', dest='invert', + help='Make sure lines are in foreground. (default: False)') +parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug', + help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"') +parser.add_argument('-o', '--output', nargs=1, action='store', dest='output', + help='Specify output directory.') parser.add_argument('file', nargs=1) result = parser.parse_args() if result.pages: - if result.pages == ['all']: - p = result.pages - else: - p = [] - for r in result.pages[0].split(' '): - if '-' in r: - a, b = r.split('-') - a, b = int(a), int(b) - p.extend([str(i) for i in range(a, b + 1)]) - else: - p.extend([str(r)]) + if result.pages == ['all']: + p = result.pages + else: + p = [] + for r in result.pages[0].split(' '): + if '-' in r: + a, b = r.split('-') + a, b = int(a), int(b) + p.extend([str(i) for i in range(a, b + 1)]) + else: + p.extend([str(r)]) else: - p = ['1'] + p = ['1'] p = sorted(set(p)) -s = result.spreadsheet - -pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) -mkdir(pdf_dir) filename = result.file[0].split('/')[-1] -logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG) +# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) +pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0]) +mkdir(pdf_dir) +logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[ + 0] + '.log'), filemode='w', level=logging.DEBUG) 
shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) print "separating pdf into pages" print if p == ['all']: - subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) + subprocess.call(['pdfseparate', os.path.join( + pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) else: - for page in p: - subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) + for page in p: + subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join( + pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) -if s: - print "using the spreadsheet method" - for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): - print "converting", g.split('/')[-1], "to image" - os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) - spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale) +if result.spreadsheet: + print "using the spreadsheet method" + for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): + print "converting", g.split('/')[-1], "to image" + os.system(' '.join(['convert', '-density', '300', + g, '-depth', '8', g[:-4] + '.png'])) + try: + spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale, + result.jtol, result.mtol, result.invert, result.debug) + except: + logging.error("Couldn't parse " + g.split('/')[-1]) + print "Couldn't parse", g.split('/')[-1] else: - print "using the basic method" - for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): - basic(pdf_dir, g.split('/')[-1]) + print "using the basic method" + for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): + basic(pdf_dir, g.split('/')[-1]) if result.format == ['xlsx']: - import csv - from pyexcel_xlsx import save_data - from collections import OrderedDict - data = OrderedDict() - for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort): - print "adding", 
c.split('/')[-1], "to excel file" - with open(c, 'r') as csvfile: - reader = csv.reader(csvfile) - data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]}) - xlsxname = filename.split('.')[0] + '.xlsx' - xlsxpath = os.path.join(pdf_dir, xlsxname) - save_data(xlsxpath, data) - print - print "saved as", xlsxname + import csv + from pyexcel_xlsx import save_data + from collections import OrderedDict + data = OrderedDict() + for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort): + print "adding", c.split('/')[-1], "to excel file" + with open(c, 'r') as csvfile: + reader = csv.reader(csvfile) + data.update({c.split('/')[-1].split('.') + [0]: [row for row in reader]}) + xlsxname = filename.split('.')[0] + '.xlsx' + xlsxpath = os.path.join(pdf_dir, xlsxname) + save_data(xlsxpath, data) + print + print "saved as", xlsxname print "finished in", time.time() - start_time, "seconds" -logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds") +logging.info("Time taken for " + filename + ": " + + str(time.time() - start_time) + " seconds") diff --git a/cell.py b/cell.py index 6e21ce9..e2e91cb 100644 --- a/cell.py +++ b/cell.py @@ -1,23 +1,24 @@ class Cell: - def __init__(self, x1, y1, x2, y2): - self.lb = (x1, y1) - self.lt = (x1, y2) - self.rb = (x2, y1) - self.rt = (x2, y2) - self.bbox = (x1, y1, x2, y2) - self.left = False - self.right = False - self.top = False - self.bottom = False - self.text = '' - self.spanning_h = False - self.spanning_v = False - def add_text(self, text): - self.text += text - - def get_text(self): - return self.text + def __init__(self, x1, y1, x2, y2): + self.lb = (x1, y1) + self.lt = (x1, y2) + self.rb = (x2, y1) + self.rt = (x2, y2) + self.bbox = (x1, y1, x2, y2) + self.left = False + self.right = False + self.top = False + self.bottom = False + self.text = '' + self.spanning_h = False + self.spanning_v = False - def get_bounded_edges(self): - return self.top + self.bottom + 
self.left + self.right \ No newline at end of file + def add_text(self, text): + self.text += text + + def get_text(self): + return self.text + + def get_bounded_edges(self): + return self.top + self.bottom + self.left + self.right diff --git a/morph_transform.py b/morph_transform.py index ac7d5c2..09f0c16 100644 --- a/morph_transform.py +++ b/morph_transform.py @@ -1,73 +1,75 @@ import cv2 import numpy as np -def transform(x, y, img_x, img_y, pdf_x, pdf_y): - x *= pdf_x / float(img_x) - y = abs(y - img_y) - y *= pdf_y / float(img_y) - return x, y -# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ -def morph(imagename, p_x, p_y, s): - img = cv2.imread(imagename) - img_x, img_y = img.shape[1], img.shape[0] - pdf_x, pdf_y = p_x, p_y - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf - threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) - vertical = threshold - horizontal = threshold +def morph_transform(imagename, s, invert): + # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ + img = cv2.imread(imagename) + img_x, img_y = img.shape[1], img.shape[0] + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # empirical result taken from + # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf + if invert: + threshold = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) + else: + threshold = cv2.adaptiveThreshold(np.invert( + gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) + vertical = threshold + horizontal = threshold - scale = s - verticalsize = vertical.shape[0] / scale - horizontalsize = horizontal.shape[1] / scale + scale = s + verticalsize = vertical.shape[0] / scale + horizontalsize = horizontal.shape[1] / scale - ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 
verticalsize)) - hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) + ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) + hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) - vertical = cv2.erode(vertical, ver, (-1, -1)) - vertical = cv2.dilate(vertical, ver, (-1, -1)) + vertical = cv2.erode(vertical, ver, (-1, -1)) + vertical = cv2.dilate(vertical, ver, (-1, -1)) - horizontal = cv2.erode(horizontal, hor, (-1, -1)) - horizontal = cv2.dilate(horizontal, hor, (-1, -1)) + horizontal = cv2.erode(horizontal, hor, (-1, -1)) + horizontal = cv2.dilate(horizontal, hor, (-1, -1)) - mask = vertical + horizontal - joints = np.bitwise_and(vertical, horizontal) - _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + mask = vertical + horizontal + joints = np.bitwise_and(vertical, horizontal) + _, contours, _ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] - tables = {} - for c in contours: - c_poly = cv2.approxPolyDP(c, 3, True) - x, y, w, h = cv2.boundingRect(c_poly) - # find number of non-zero values in joints using what boundingRect returns - roi = joints[y:y+h, x:x+w] - _, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - if len(jc) <= 4: # remove contours with less than <=4 joints - continue - joint_coords = [] - for j in jc: - jx, jy, jw, jh = cv2.boundingRect(j) - c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2 - c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y) - joint_coords.append((c1, c2)) - x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) - x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) - tables[(x1, y2, x2, y1)] = joint_coords + tables = {} + for c in contours: + c_poly = cv2.approxPolyDP(c, 3, True) + x, y, w, h = cv2.boundingRect(c_poly) + # find number of non-zero values 
in joints using what boundingRect + # returns + roi = joints[y:y + h, x:x + w] + _, jc, _ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + if len(jc) <= 4: # remove contours with less than <=4 joints + continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 + joint_coords.append((c1, c2)) + tables[(x, y + h, x + w, y)] = joint_coords - v_segments, h_segments = [], [] - _, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for vc in vcontours: - x, y, w, h = cv2.boundingRect(vc) - x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) - x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) - v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) - - _, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for hc in hcontours: - x, y, w, h = cv2.boundingRect(hc) - x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) - x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) - h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + v_segments, h_segments = [], [] + _, vcontours, _ = cv2.findContours( + vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for vc in vcontours: + x, y, w, h = cv2.boundingRect(vc) + x1, x2 = x, x + w + y1, y2 = y, y + h + v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) - return tables, v_segments, h_segments \ No newline at end of file + _, hcontours, _ = cv2.findContours( + horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for hc in hcontours: + x, y, w, h = cv2.boundingRect(hc) + x1, x2 = x, x + w + y1, y2 = y, y + h + h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + + return tables, v_segments, h_segments diff --git a/pdf.py b/pdf.py index d210953..3c603e0 100644 --- a/pdf.py +++ b/pdf.py @@ -8,47 +8,51 @@ from pdfminer.pdfdevice import PDFDevice from pdfminer.converter import PDFPageAggregator from pdfminer.layout 
import LAParams, LTChar, LTTextLineHorizontal -text = [] -def parse_text_basic(layout): - global text - try: - for obj in layout._objs: - if type(obj) is LTTextLineHorizontal: - text.append(obj) - parse_text_basic(obj) - except AttributeError: - pass +def parse_text_basic(layout, t=None): + if t is None: + t = [] + try: + for obj in layout._objs: + if type(obj) is LTTextLineHorizontal: + t.append(obj) + else: + t += parse_text_basic(obj) + except AttributeError: + pass + return t + + +def parse_text_spreadsheet(layout, t=None): + if t is None: + t = [] + try: + for obj in layout._objs: + if type(obj) is LTChar: + t.append(obj) + else: + t += parse_text_spreadsheet(obj) + except AttributeError: + pass + return t -def parse_text_spreadsheet(layout): - global text - try: - for obj in layout._objs: - if type(obj) is LTChar: - text.append(obj) - parse_text_spreadsheet(obj) - except AttributeError: - pass def get_pdf_info(pdfname, method): - global text - with open(pdfname, 'r') as f: - parser = PDFParser(f) - document = PDFDocument(parser) - if not document.is_extractable: - raise PDFTextExtractionNotAllowed - laparams = LAParams() - rsrcmgr = PDFResourceManager() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() - text = [] - if method == 'basic': - parse_text_basic(layout) - elif method == 'spreadsheet': - parse_text_spreadsheet(layout) - pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] - text.sort(key=lambda x: (-x.y0, x.x0)) - return text, pdf_x, pdf_y \ No newline at end of file + with open(pdfname, 'r') as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + laparams = LAParams() + rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + 
for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + if method == 'basic': + text = parse_text_basic(layout) + elif method == 'spreadsheet': + text = parse_text_spreadsheet(layout) + pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] + return text, pdf_x, pdf_y diff --git a/spreadsheet.py b/spreadsheet.py index fa79012..8d04a4c 100644 --- a/spreadsheet.py +++ b/spreadsheet.py @@ -1,129 +1,175 @@ import os import csv +import cv2 import glob import numpy as np -import matplotlib.pyplot as plt from table import Table from pdf import get_pdf_info -from morph_transform import morph +from morph_transform import morph_transform +from utils import (translate, scale, merge_close_values, get_row_idx, + get_column_idx, reduce_index, outline, fill, remove_empty) -def remove_close_values(ar): - ret = [] - for a in ar: - if not ret: - ret.append(a) - else: - temp = ret[-1] - if np.isclose(temp, a, atol=2): - pass - else: - ret.append(a) - return ret -def merge_close_values(ar): - ret = [] - for a in ar: - if not ret: - ret.append(a) - else: - temp = ret[-1] - if np.isclose(temp, a, atol=2): - temp = (temp + a) / 2.0 - ret[-1] = temp - else: - ret.append(a) - return ret +def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug): + if debug: + import matplotlib.pyplot as plt + import matplotlib.patches as patches + print "working on", filename + imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') + img = cv2.imread(imagename) + img_x, img_y = img.shape[1], img.shape[0] + text, pdf_x, pdf_y = get_pdf_info( + os.path.join(pdf_dir, filename), 'spreadsheet') + scaling_factor_x = pdf_x / float(img_x) + scaling_factor_y = pdf_y / float(img_y) + tables, v_segments, h_segments = morph_transform(imagename, s, invert) -def get_row_idx(t, rows): - for r in range(len(rows)): - if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]: - return r + if debug == ["contours"]: + for t in 
tables.keys(): + cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3) + plt.imshow(img) + if debug == ["joints"]: + x_coord = [] + y_coord = [] + for k in tables.keys(): + for coord in tables[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + max_x, max_y = max(x_coord), max(y_coord) + plt.plot(x_coord, y_coord, 'ro') + plt.axis([0, max_x + 100, max_y + 100, 0]) + plt.imshow(img) -def get_column_idx(t, columns): - for c in range(len(columns)): - if abs(t.x0 + t.x1) / 2.0 > columns[c][0] and abs(t.x0 + t.x1) / 2.0 < columns[c][1]: - return c + # detect if vertical + num_v = [t for t in text if (not t.upright) and t.get_text().strip()] + num_h = [t for t in text if t.upright and t.get_text().strip()] + vger = len(num_v) / float(len(num_v) + len(num_h)) + rotated = '' + if vger > 0.8: + clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text) + anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text) + rotated = 'left' if clockwise < anticlockwise else 'right' -def reduce_index(t, r_idx, c_idx): - if t.cells[r_idx][c_idx].spanning_h: - while not t.cells[r_idx][c_idx].left: - c_idx -= 1 - if t.cells[r_idx][c_idx].spanning_v: - while not t.cells[r_idx][c_idx].top: - r_idx -= 1 - return r_idx, c_idx + tables_new = {} + for k in tables.keys(): + x1, y1, x2, y2 = k + x1 = scale(x1, scaling_factor_x) + y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y) + x2 = scale(x2, scaling_factor_x) + y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y) + j_x, j_y = zip(*tables[k]) + j_x = [scale(j, scaling_factor_x) for j in j_x] + j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y] + joints = zip(j_x, j_y) + tables_new[(x1, y1, x2, y2)] = joints -def fill(t, orientation): - if orientation == "h": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_h: - t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) - elif orientation == 
"v": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_v: - t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) - elif orientation == "hv": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_h: - t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) - elif t.cells[i][j].spanning_v: - t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) - return t + v_segments_new = [] + for v in v_segments: + x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x) + y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale( + abs(translate(-img_y, v[3])), scaling_factor_y) + v_segments_new.append((x1, y1, x2, y2)) -def spreadsheet(pdf_dir, filename, orientation, scale): - print "working on", filename - imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') - text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') - tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) + h_segments_new = [] + for h in h_segments: + x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x) + y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale( + abs(translate(-img_y, h[3])), scaling_factor_y) + h_segments_new.append((x1, y1, x2, y2)) - num_tables = 0 - for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord - # find rows and columns that lie in table - lb = (k[0], k[1]) - rt = (k[2], k[3]) - v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] - h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] - columns, rows = zip(*tables[k]) - # sort horizontal and vertical segments - columns = merge_close_values(sorted(list(columns))) - rows = merge_close_values(sorted(list(rows), 
reverse=True)) - # make grid using x and y coord of shortlisted rows and columns - columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] - rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + num_tables = 0 + # sort tables based on y-coord + for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True): + # find rows and columns that lie in table + lb = (k[0], k[1]) + rt = (k[2], k[3]) + v_s = [v for v in v_segments_new if v[1] > lb[1] - 2 and v[3] + < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] + h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2] + < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] - table = Table(columns, rows) - # pass row and column line segments to table method and light up cell edges - table = table.set_edges(v_s, h_s) - # table set span method - table = table.set_spanning() - # fill text after sorting it - text.sort(key=lambda x: (-x.y0, x.x0)) + if debug == ["lines"]: + for v in v_s: + plt.plot([v[0], v[2]], [v[1], v[3]]) + for h in h_s: + plt.plot([h[0], h[2]], [h[1], h[3]]) - for t in text: - r_idx = get_row_idx(t, rows) - c_idx = get_column_idx(t, columns) - if None in [r_idx, c_idx]: - pass - else: - r_idx, c_idx = reduce_index(table, r_idx, c_idx) - table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) + columns, rows = zip(*tables_new[k]) + columns, rows = list(columns), list(rows) + columns.extend([lb[0], rt[0]]) + rows.extend([lb[1], rt[1]]) + # sort horizontal and vertical segments + columns = merge_close_values(sorted(columns), mtol) + rows = merge_close_values(sorted(rows, reverse=True), mtol) + # make grid using x and y coord of shortlisted rows and columns + columns = [(columns[i], columns[i + 1]) + for i in range(0, len(columns) - 1)] + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] - if orientation: - table = fill(table, orientation) + table = Table(columns, rows) + # light up cell edges + table = table.set_edges(v_s, h_s, jtol) + # table set span 
method + table = table.set_spanning() + # TODO + table = outline(table) - csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' - csvpath = os.path.join(pdf_dir, csvname) - with open(csvpath, 'w') as outfile: - writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) - for i in range(len(table.cells)): - writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))]) - print "saved as", csvname - print - num_tables += 1 \ No newline at end of file + if debug == ["tables"]: + for i in range(len(table.cells)): + for j in range(len(table.cells[i])): + if table.cells[i][j].left: + plt.plot([table.cells[i][j].lb[0], table.cells[i][j].lt[0]], + [table.cells[i][j].lb[1], table.cells[i][j].lt[1]]) + if table.cells[i][j].right: + plt.plot([table.cells[i][j].rb[0], table.cells[i][j].rt[0]], + [table.cells[i][j].rb[1], table.cells[i][j].rt[1]]) + if table.cells[i][j].top: + plt.plot([table.cells[i][j].lt[0], table.cells[i][j].rt[0]], + [table.cells[i][j].lt[1], table.cells[i][j].rt[1]]) + if table.cells[i][j].bottom: + plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]], + [table.cells[i][j].lb[1], table.cells[i][j].rb[1]]) + if debug: + plt.show() + + # fill text after sorting it + if not rotated: + text.sort(key=lambda x: (-x.y0, x.x0)) + elif rotated == 'left': + text.sort(key=lambda x: (x.x0, x.y0)) + elif rotated == 'right': + text.sort(key=lambda x: (-x.x0, -x.y0)) + + for t in text: + r_idx = get_row_idx(t, rows) + c_idx = get_column_idx(t, columns) + if None in [r_idx, c_idx]: + pass + else: + r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx) + table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) + + if fill: + table = fill(table, fill) + + data = [] + for i in range(len(table.cells)): + data.append([table.cells[i][j].get_text().strip().encode('utf-8') + for j in range(len(table.cells[i]))]) + if rotated == 'left': + data = zip(*data[::-1]) + elif rotated == 'right': + data = 
import numpy as np


class Table:
    """Rectangular grid of cells built from column and row boundaries.

    ``columns`` is a sequence of ``(start, end)`` x-pairs and ``rows`` a
    sequence of ``(start, end)`` y-pairs; only the first element of each
    pair is used when matching line segments in ``set_edges``.
    """

    def __init__(self, columns, rows):
        """Create one ``Cell`` per (row, column) pair, row-major."""
        # Local import keeps this module importable on its own; Cell is
        # only needed when the grid is actually built.
        from cell import Cell

        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in columns] for r in rows]
        self.columns = columns
        self.rows = rows

    @staticmethod
    def _first_close(value, bounds, tol):
        """Return the index of the first pair in ``bounds`` whose first
        element is within ``tol`` of ``value``, or ``None`` if none is."""
        for idx, bound in enumerate(bounds):
            if np.isclose(value, bound[0], atol=tol):
                return idx
        return None

    def set_edges(self, vertical, horizontal, jtol):
        """Mark cell borders that are covered by detected line segments.

        vertical   -- segments indexed as (x, y, x, y); element 0 selects
                      the column, element 3 the first row covered and
                      element 1 the row at which the segment stops.
        horizontal -- segments indexed the same way; element 1 selects the
                      row, element 0 the first column covered and
                      element 2 the column at which the segment stops.
        jtol       -- absolute tolerance used when comparing a segment
                      coordinate with a column/row boundary.

        Returns ``self`` so calls can be chained.
        """
        n_rows = len(self.rows)
        n_cols = len(self.columns)

        for v in vertical:
            first = self._first_close(v[3], self.rows, jtol)
            if first is None:
                # segment does not start on any known row boundary
                continue
            col = self._first_close(v[0], self.columns, jtol)
            last = self._first_close(v[1], self.rows, jtol)
            if last is None:
                # no row starts at the far end: segment runs to the bottom
                last = n_rows
            for r in range(first, last):
                if col is None:
                    # x matches no column start: treated as the table's
                    # right-most border
                    self.cells[r][n_cols - 1].right = True
                else:
                    self.cells[r][col].left = True
                    # The first column has no left neighbour.  Guarding
                    # here (instead of comparing the match list to [0])
                    # avoids wrapping to index -1 when the segment is
                    # within tolerance of more than one column.
                    if col > 0:
                        self.cells[r][col - 1].right = True

        for h in horizontal:
            first = self._first_close(h[0], self.columns, jtol)
            if first is None:
                continue
            row = self._first_close(h[1], self.rows, jtol)
            last = self._first_close(h[2], self.columns, jtol)
            if last is None:
                last = n_cols
            for c in range(first, last):
                if row is None:
                    # y matches no row start: treated as the table's
                    # bottom border
                    self.cells[n_rows - 1][c].bottom = True
                else:
                    self.cells[row][c].top = True
                    if row > 0:
                        self.cells[row - 1][c].bottom = True

        return self

    def set_spanning(self):
        """Flag cells that belong to a horizontal or vertical span.

        A cell whose ``get_bounded_edges()`` is 3 and that has exactly one
        border missing is ``spanning_h`` when the missing border is left
        or right, and ``spanning_v`` when it is top or bottom.  A cell
        whose ``get_bounded_edges()`` is 2 is ``spanning_v`` when only
        left and right are set, and ``spanning_h`` when only top and
        bottom are set.  Returns ``self``.
        """
        for row in self.cells:
            for cell in row:
                bound = cell.get_bounded_edges()
                left = bool(cell.left)
                right = bool(cell.right)
                top = bool(cell.top)
                bottom = bool(cell.bottom)

                if bound == 3:
                    if [left, right, top, bottom].count(True) != 3:
                        continue
                    if not (left and right):
                        cell.spanning_h = True
                    else:
                        cell.spanning_v = True
                elif bound == 2:
                    if left and right and not top and not bottom:
                        cell.spanning_v = True
                    elif top and bottom and not left and not right:
                        cell.spanning_h = True

        return self
import numpy as np


def translate(x1, x2):
    """Shift ``x2`` by the offset ``x1`` and return the result.

    Augmented assignment is kept on purpose: a mutable operand passed as
    ``x2`` (for example a numpy array) is updated in place.
    """
    x2 += x1
    return x2


def scale(x, s):
    """Multiply ``x`` by the factor ``s`` and return the result.

    Augmented assignment is kept on purpose: a mutable operand passed as
    ``x`` is updated in place.
    """
    x *= s
    return x


def rotate(x1, y1, x2, y2, angle):
    """Rotate the point ``(x2, y2)`` about the pivot ``(x1, y1)``.

    ``angle`` is handed directly to ``np.sin``/``np.cos`` (radians).
    Returns the rotated point as ``(xnew, ynew)``.
    """
    s = np.sin(angle)
    c = np.cos(angle)
    # move the pivot to the origin, rotate, then move back
    x2 = translate(-x1, x2)
    y2 = translate(-y1, y2)
    xnew = c * x2 - s * y2
    ynew = s * x2 + c * y2
    xnew = translate(x1, xnew)
    ynew = translate(y1, ynew)
    return xnew, ynew


def remove_close_values(ar, mtol):
    """Drop every value that is within ``mtol`` of the last value kept.

    ``ar`` is scanned in the order given, so it is expected to be sorted
    by the caller.  Returns a new list.
    """
    ret = []
    for a in ar:
        if not ret or not np.isclose(ret[-1], a, atol=mtol):
            ret.append(a)
    return ret


def merge_close_values(ar, mtol):
    """Average each value into the last kept value when within ``mtol``.

    Unlike ``remove_close_values`` the kept value is replaced by the mean
    of itself and the close value.  Returns a new list.
    """
    ret = []
    for a in ar:
        if ret and np.isclose(ret[-1], a, atol=mtol):
            ret[-1] = (ret[-1] + a) / 2.0
        else:
            ret.append(a)
    return ret


def get_row_idx(t, rows):
    """Return the index of the row whose span strictly contains the
    vertical midpoint of ``t`` (``t.y0``/``t.y1``), or ``None``.

    Each row is indexed as ``(upper, lower)`` with ``upper > lower``.
    """
    y_mid = (t.y0 + t.y1) / 2.0
    for r, row in enumerate(rows):
        if row[1] < y_mid < row[0]:
            return r
    return None


def get_column_idx(t, columns):
    """Return the index of the column whose span strictly contains the
    horizontal midpoint of ``t`` (``t.x0``/``t.x1``), or ``None``.

    Each column is indexed as ``(lower, upper)`` with ``lower < upper``.
    """
    x_mid = (t.x0 + t.x1) / 2.0
    for c, column in enumerate(columns):
        if column[0] < x_mid < column[1]:
            return c
    return None


def reduce_index(t, rotated, r_idx, c_idx):
    """Move ``(r_idx, c_idx)`` to the anchor cell of the span it is in.

    For a horizontally spanning cell the column index is walked until a
    cell with the anchoring side border is found; the same is then done
    for the row index of a vertically spanning cell.  The anchoring
    border and walking direction depend on ``rotated`` (falsy, ``'left'``
    or ``'right'``); any other value leaves the indices untouched.
    """
    if not rotated:
        h_edge, h_step, v_edge, v_step = 'left', -1, 'top', -1
    elif rotated == 'left':
        h_edge, h_step, v_edge, v_step = 'left', -1, 'bottom', 1
    elif rotated == 'right':
        h_edge, h_step, v_edge, v_step = 'right', 1, 'top', -1
    else:
        return r_idx, c_idx

    if t.cells[r_idx][c_idx].spanning_h:
        while not getattr(t.cells[r_idx][c_idx], h_edge):
            c_idx += h_step
    # evaluated on the (possibly moved) cell, as the column walk comes first
    if t.cells[r_idx][c_idx].spanning_v:
        while not getattr(t.cells[r_idx][c_idx], v_edge):
            r_idx += v_step
    return r_idx, c_idx


def outline(t):
    """Set the outer border on every cell along the table's perimeter.

    Returns ``t``.  A table without rows is returned unchanged.
    """
    if not t.cells:
        return t
    for row in t.cells:
        row[0].left = True
        row[-1].right = True
    for cell in t.cells[0]:
        cell.top = True
    for cell in t.cells[-1]:
        cell.bottom = True
    return t


def fill(t, f):
    """Copy text into empty spanning cells from their neighbour.

    f -- ``"h"`` fills horizontally spanning cells from the cell to the
         left, ``"v"`` fills vertically spanning cells from the cell
         above, ``"hv"`` does both (horizontal takes precedence).  Any
         other value is a no-op.

    Cells are visited row by row, left to right, so a value propagates
    across a span of several cells.  Returns ``t``.
    """
    use_h = f in ("h", "hv")
    use_v = f in ("v", "hv")
    if not (use_h or use_v):
        return t

    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if use_h and cell.spanning_h:
                # the first column has no left neighbour; indexing j - 1
                # there would wrap around to the last column
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif use_v and cell.spanning_v:
                # likewise the first row has no neighbour above
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t


def remove_empty(d):
    """Strip rows and columns that carry no data.

    A row is dropped when every entry equals ``''``; a column is dropped
    when none of its entries is truthy.  Rows may be lists or tuples.
    The input is not modified; the result is a list of tuples.
    """
    # Build a new list instead of popping while iterating, which skipped
    # the row following each removed one.
    rows = [row for row in d if any(cell != '' for cell in row)]
    columns = [col for col in zip(*rows) if any(col)]
    return list(zip(*columns))