From f6869a9af4c410dfbdfeba1fcfd938ebe95e857e Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 23 Jun 2016 18:30:05 +0530 Subject: [PATCH] Improve grid detection and add more options --- .gitignore | 3 +++ README.md | 7 +++-- camelot.py | 65 +++++++++++++++++++++++++--------------------- morph_transform.py | 38 ++++++++++++--------------- spreadsheet.py | 64 +++++++++++++++++++++++++++++++-------------- table.py | 8 ++++-- 6 files changed, 111 insertions(+), 74 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fefd514 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.py[cod] +.camelot/ diff --git a/README.md b/README.md index 8739e1f..b57fafe 100644 --- a/README.md +++ b/README.md @@ -12,14 +12,17 @@ optional arguments: -h, --help show this help message and exit -p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be - parsed. Example: -p="1 3-5 9". (default: -p="1") + parsed. Example: -p="1 3-5 9", -p="all" (default: + -p="1") -f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv") -spreadsheet Extract data stored in pdfs with ruling lines. + (default: False) - -guess [Experimental] Guess the values in empty cells. + -F ORIENTATION Fill the values in empty cells. Example: -F="h", + -F="v", -F="hv" (default: None) -s [SCALE] Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15) diff --git a/camelot.py b/camelot.py index fe2457b..1da3b77 100644 --- a/camelot.py +++ b/camelot.py @@ -1,7 +1,9 @@ import os import re import glob +import time import shutil +import logging import subprocess import argparse @@ -16,62 +18,64 @@ def mkdir(directory): def filesort(filename): filename = filename.split('/')[-1] - return int(pno.findall(filename)[0]) + num = pno.findall(filename) + if len(num) == 2: + return (int(num[0]), int(num[1])) + else: + return (int(num[0]), 0) +start_time = time.time() CAMELOT_DIR = '.camelot/' mkdir(CAMELOT_DIR) parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file') -parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")') -parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")') -parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.') -parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.') +parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")') +parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"]) +parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)') +parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None) parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) parser.add_argument('file', nargs=1) result = parser.parse_args() if result.pages: - p = [] - for r in result.pages[0].split(' '): - if '-' in r: - a, b = r.split('-') - a, b = int(a), int(b) - p.extend([str(i) for i in range(a, b + 1)]) - else: - p.extend([str(r)]) + if result.pages == ['all']: + p = result.pages + else: + p = [] + for r in result.pages[0].split(' '): + if '-' in r: + a, b = r.split('-') + a, b = int(a), int(b) + p.extend([str(i) for i in range(a, b + 1)]) + else: + p.extend([str(r)]) else: p = ['1'] p = sorted(set(p)) -if result.format: - f = result.format -else: - f = ['csv'] - -if result.spreadsheet: - s = True -else: - s = False +s = result.spreadsheet pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) mkdir(pdf_dir) filename = result.file[0].split('/')[-1] +logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG) + shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) print "separating pdf into pages" print -for page in p: - subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) +if p == ['all']: + subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) +else: + for page in p: + subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) if s: print "using the spreadsheet method" for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): print "converting", g.split('/')[-1], "to image" os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) - try: - spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale) - except: - pass + spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale) else: print "using the basic method" for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): @@ -91,4 +95,7 @@ if result.format == ['xlsx']: xlsxpath = os.path.join(pdf_dir, xlsxname) save_data(xlsxpath, data) print - print "saved as", xlsxname \ No newline at end of file + print "saved as", xlsxname + +print "finished in", time.time() - start_time, "seconds" +logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds") diff --git a/morph_transform.py b/morph_transform.py index a0b588e..ac7d5c2 100644 --- a/morph_transform.py +++ b/morph_transform.py @@ -1,20 +1,6 @@ import cv2 -import sys -import subprocess -import matplotlib.pyplot as plt -import matplotlib.patches as patches import numpy as np -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfpage import PDFTextExtractionNotAllowed -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfinterp import PDFPageInterpreter -from pdfminer.pdfdevice import PDFDevice -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTChar - def transform(x, y, img_x, img_y, pdf_x, pdf_y): x *= pdf_x / float(img_x) y = abs(y - img_y) @@ -27,9 +13,10 @@ def morph(imagename, p_x, p_y, s): img_x, img_y = img.shape[1], img.shape[0] pdf_x, pdf_y = p_x, p_y gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) - vertical = th1 - horizontal = th1 + # empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf + threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) + vertical = threshold + horizontal = threshold scale = s verticalsize = vertical.shape[0] / scale @@ -51,15 +38,22 @@ def morph(imagename, p_x, p_y, s): tables = {} for c in contours: - x, y, w, h = cv2.boundingRect(c) - jmask = joints[y:y+h, x:x+w] - _, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - + c_poly = cv2.approxPolyDP(c, 3, True) + x, y, w, h = cv2.boundingRect(c_poly) + # find number of non-zero values in joints using what boundingRect returns + roi = joints[y:y+h, x:x+w] + _, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than <=4 joints continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2 + c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y) + joint_coords.append((c1, c2)) x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) - tables[(x1, y2)] = (x2, y1) + tables[(x1, y2, x2, y1)] = joint_coords v_segments, h_segments = [], [] _, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) diff --git a/spreadsheet.py b/spreadsheet.py index 8a7aa41..fa79012 100644 --- a/spreadsheet.py +++ b/spreadsheet.py @@ -15,12 +15,26 @@ def remove_close_values(ar): ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=1): + if np.isclose(temp, a, atol=2): pass else: ret.append(a) return ret +def merge_close_values(ar): + ret = [] + for a in ar: + if not ret: + ret.append(a) + else: + temp = ret[-1] + if np.isclose(temp, a, atol=2): + temp = (temp + a) / 2.0 + ret[-1] = temp + else: + ret.append(a) + return ret + def get_row_idx(t, rows): for r in range(len(rows)): if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]: @@ -40,34 +54,46 @@ def reduce_index(t, r_idx, c_idx): r_idx -= 1 return r_idx, c_idx -def fill(t): - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_h: - t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) - elif t.cells[i][j].spanning_v: - t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) +def fill(t, orientation): + if orientation == "h": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].get_text().strip() == '': + if t.cells[i][j].spanning_h: + t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) + elif orientation == "v": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].get_text().strip() == '': + if t.cells[i][j].spanning_v: + t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) + elif orientation == "hv": + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].get_text().strip() == '': + if t.cells[i][j].spanning_h: + t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) + elif t.cells[i][j].spanning_v: + t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) return t -def spreadsheet(pdf_dir, filename, guess, scale): +def spreadsheet(pdf_dir, filename, orientation, scale): print "working on", filename imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) num_tables = 0 - for k in sorted(tables.keys(), reverse=True): + for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord # find rows and columns that lie in table - lb = k - rt = tables[k] + lb = (k[0], k[1]) + rt = (k[2], k[3]) v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] - columns = [v[0] for v in v_s] - rows = [h[1] for h in h_s] + columns, rows = zip(*tables[k]) # sort horizontal and vertical segments - columns = remove_close_values(sorted(columns)) - rows = remove_close_values(sorted(rows, reverse=True)) + columns = merge_close_values(sorted(list(columns))) + rows = merge_close_values(sorted(list(rows), reverse=True)) # make grid using x and y coord of shortlisted rows and columns columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] @@ -89,8 +115,8 @@ def spreadsheet(pdf_dir, filename, guess, scale): r_idx, c_idx = reduce_index(table, r_idx, c_idx) table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) - if guess: - table = fill(table) + if orientation: + table = fill(table, orientation) csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' csvpath = os.path.join(pdf_dir, csvname) diff --git a/table.py b/table.py index 4d5f028..dd1c34e 100644 --- a/table.py +++ b/table.py @@ -11,9 +11,11 @@ class Table: for v in vertical: # find closest x coord # iterate over y coords and find closest points - i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0])] + i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)] j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)] k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)] + if not j: + continue if i == [0]: # only left edge if k: I = i[0] @@ -65,9 +67,11 @@ class Table: for h in horizontal: # find closest y coord # iterate over x coords and find closest points - i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0])] + i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)] j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)] k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)] + if not j: + continue if i == [0]: # only top edge if k: I = i[0]