From eef07a86c63049a12d083769e0679c705fe6e27b Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sat, 18 Jun 2016 17:20:42 +0530 Subject: [PATCH] First commit :fire: --- README.md | 27 ++++++++ basic.py | 58 +++++++++++++++++ basic.pyc | Bin 0 -> 2462 bytes camelot.py | 94 ++++++++++++++++++++++++++++ cell.py | 23 +++++++ cell.pyc | Bin 0 -> 1390 bytes morph_transform.py | 79 ++++++++++++++++++++++++ morph_transform.pyc | Bin 0 -> 3446 bytes pdf.py | 54 ++++++++++++++++ pdf.pyc | Bin 0 -> 2442 bytes spreadsheet.py | 103 +++++++++++++++++++++++++++++++ spreadsheet.pyc | Bin 0 -> 4601 bytes table.py | 147 ++++++++++++++++++++++++++++++++++++++++++++ table.pyc | Bin 0 -> 4128 bytes 14 files changed, 585 insertions(+) create mode 100644 README.md create mode 100644 basic.py create mode 100644 basic.pyc create mode 100644 camelot.py create mode 100644 cell.py create mode 100644 cell.pyc create mode 100644 morph_transform.py create mode 100644 morph_transform.pyc create mode 100644 pdf.py create mode 100644 pdf.pyc create mode 100644 spreadsheet.py create mode 100644 spreadsheet.pyc create mode 100644 table.py create mode 100644 table.pyc diff --git a/README.md b/README.md new file mode 100644 index 0000000..8739e1f --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +Camelot +------- + +usage: python2 camelot.py [options] pdf_file + +Parse yo pdf! + +positional arguments: + file + +optional arguments: + -h, --help show this help message and exit + + -p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be + parsed. Example: -p="1 3-5 9". (default: -p="1") + + -f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: + -f="csv") + + -spreadsheet Extract data stored in pdfs with ruling lines. + + -guess [Experimental] Guess the values in empty cells. + + -s [SCALE] Scaling factor. Large scaling factor leads to smaller + lines being detected. (default: 15) + +Under construction... \ No newline at end of file diff --git a/basic.py b/basic.py new file mode 100644 index 0000000..bad348c --- /dev/null +++ b/basic.py @@ -0,0 +1,58 @@ +import os +import csv +import numpy as np + +from pdf import get_pdf_info + +def overlap(l): + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if higher[0] >= lower[0] and higher[1] <= lower[1]: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + +def get_row_idx(t, rows): + for r in range(len(rows)): + if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: + return r + +def get_column_idx(t, columns): + for c in range(len(columns)): + if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: + return c + +def basic(pdf_dir, filename): + print "working on", filename + text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') + rows, columns = [], [] + for t in text: + rows.append((t.y1, t.y0)) + columns.append((t.x0, t.x1)) + rows = list(set(rows)) + rows = sorted(rows, reverse=True) + columns = list(set(columns)) + columns = sorted(columns) + columns = overlap(columns) + table = [['' for c in columns] for r in rows] + for t in text: + r_idx = get_row_idx(t, rows) + c_idx = get_column_idx(t, columns) + if None in [r_idx, c_idx]: + print t + else: + table[r_idx][c_idx] = t.get_text().strip('\n') + + csvname = filename.split('.')[0] + '.csv' + csvpath = os.path.join(pdf_dir, csvname) + with open(csvpath, 'w') as outfile: + writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) + for cell in table: + writer.writerow([ce for ce in cell]) \ No newline at end of file diff --git a/basic.pyc b/basic.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9309a541ed490776e23bb271d71a5bfc2a16abe1 GIT binary patch literal 2462 zcmb_d-EJFI5T3LCN$lA9ZJI;~C^z5&iqkuUA}Fd+dJzS>E(!__mb1HY)>-e`yT^$u z*;nD77w9D~zyt6O-0%XtLB#{W_su30DkLr%d(Z5gIp@ro`Q~Q}f7Gil?+1sTT>l)r zA7W(BKzuw#B9WuNc_T;WM()T_L3u}_q6|uMR1)JC7=%04N_%!h-En}209${oZgY-mZMkYVWRHSwMLS~Lkjb!&p z`yx|Eu8f#bopW5XH7ii&JF+!}t$3Z6Bz>W>LN3GW+)!qXa1X^ef!CEIv!YnskRQEP zL~%)`c#3kW*W;qy{K|isa0naPjUuc*P)9>>64I?k0mRa|?r<2yzU6$-o#gK@Zg2_q zh(x+;5Tw1pSN1IIodu~SC{c0=ZUNjFTGGx*GKMP7btRuSFcl@)r{J1r$slN6B&iK1 z&3592-RMy=%$mbA=?9(#>(BQ9_J~9d|_^&;E_mL-=NP5%qCPk z9`p%34(b=RI(f*f(QGTTYWqc*mZgsh1}(FQ*)&km*9&vHFn_Z!YRRFkh^_K$p1JG` z3!I`)pmSXrLrK1;bhD?VdvppTEPB!n57ThAb^NZ(=4dAtTE{h+p+^?wdq+mUEB#Go z6`6Rs@!!d;EOwEm(F{0R6}zNUsKs=Fy?6>E7o_LNd0EoW4L0vR9vQy-D{_U3T4g_m z6VnR!ZejB-*4e81YwXZi^5;-*^Rh9ku?M|^8h+xMy4X81t&3mb^vADdwkUR8`gdiz zC_ZcgzoCdM$z+@Nw2t4G*|JQRq`#@&F6Z8^$aF>h{M8Wu(J$tZRsG~!kamMvdXQB? zFO&2IMJun(47MdDcSaLD) z3P>{T1H-O8Q>LGUu}gJsorD?du94gU(H&8kS#2)3T)T3{hU)9u({voTG}~noyBjxW z4w5*q^O!OF(S5Btli4&J=G-s}t!hZ{Nl@)Ho?YWI4J|r73*yN)Up@V@^TqzYs>&CV z%LcDf5Bc@dmkGQ0`4Xr%Z+2 ZLE?|2;4{W0W1i80nzQUwmMW!6=`Vuq-r)cM literal 0 HcmV?d00001 diff --git a/camelot.py b/camelot.py new file mode 100644 index 0000000..fe2457b --- /dev/null +++ b/camelot.py @@ -0,0 +1,94 @@ +import os +import re +import glob +import shutil +import subprocess +import argparse + +from basic import basic +from spreadsheet import spreadsheet + +pno = re.compile(r'\d+') + +def mkdir(directory): + if not os.path.isdir(directory): + os.makedirs(directory) + +def filesort(filename): + filename = filename.split('/')[-1] + return int(pno.findall(filename)[0]) + +CAMELOT_DIR = '.camelot/' +mkdir(CAMELOT_DIR) + +parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file') +parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")') +parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")') +parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.') +parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.') +parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) +parser.add_argument('file', nargs=1) + +result = parser.parse_args() + +if result.pages: + p = [] + for r in result.pages[0].split(' '): + if '-' in r: + a, b = r.split('-') + a, b = int(a), int(b) + p.extend([str(i) for i in range(a, b + 1)]) + else: + p.extend([str(r)]) +else: + p = ['1'] +p = sorted(set(p)) + +if result.format: + f = result.format +else: + f = ['csv'] + +if result.spreadsheet: + s = True +else: + s = False + +pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) +mkdir(pdf_dir) +filename = result.file[0].split('/')[-1] +shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) +print "separating pdf into pages" +print +for page in p: + subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) + +if s: + print "using the spreadsheet method" + for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): + print "converting", g.split('/')[-1], "to image" + os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) + try: + spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale) + except: + pass +else: + print "using the basic method" + for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): + basic(pdf_dir, g.split('/')[-1]) + +if result.format == ['xlsx']: + import csv + from pyexcel_xlsx import save_data + from collections import OrderedDict + data = OrderedDict() + for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort): + print "adding", c.split('/')[-1], "to excel file" + with open(c, 'r') as csvfile: + reader = csv.reader(csvfile) + data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]}) + xlsxname = filename.split('.')[0] + '.xlsx' + xlsxpath = os.path.join(pdf_dir, xlsxname) + save_data(xlsxpath, data) + print + print "saved as", xlsxname \ No newline at end of file diff --git a/cell.py b/cell.py new file mode 100644 index 0000000..6e21ce9 --- /dev/null +++ b/cell.py @@ -0,0 +1,23 @@ +class Cell: + def __init__(self, x1, y1, x2, y2): + self.lb = (x1, y1) + self.lt = (x1, y2) + self.rb = (x2, y1) + self.rt = (x2, y2) + self.bbox = (x1, y1, x2, y2) + self.left = False + self.right = False + self.top = False + self.bottom = False + self.text = '' + self.spanning_h = False + self.spanning_v = False + + def add_text(self, text): + self.text += text + + def get_text(self): + return self.text + + def get_bounded_edges(self): + return self.top + self.bottom + self.left + self.right \ No newline at end of file diff --git a/cell.pyc b/cell.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5344e651579f405154f8c7bebbf77274bbdaf7c GIT binary patch literal 1390 zcmb_cOK;Oa5FS60)@e&YP{IN3J|HZ5sgMvtMB*3;=YYs%<;UhxIo>F{tx_pB_zC<| z{s7;M?GU(T-{XJ@{hoi_M&G5GRIe=Vr}1dt!G>=01m7f|WZ52Eix-=?sAn#3XY z9K%Rg)&8gl$0scN8lZ`$9w)%jso&<9BAG9qkQ4J(kJ2Vz(ZeZTl9$u`1nfN8`84xs zN2D4v>`ZJ02fK)Yx!;8@XIp0|8A5VBQTm;7(kZ8%5;0jWH|RWPopR0>n#4YcjU9mE z3xH$0${XQCY#M9hq)T~jwx~<=IjgJ|SEXl8T$|;5;T(6}nYHUf=Ega*k|?LQ&JEzR z>#VMq^;|9fHt%CjE3sBrGpW6OE8?LM+v~=!nkQZ*E@z$h&O6S6+1REjWS~-*ALw zp90X`@K#>)#>cBlQ+ayEF=KiTkS3>>dlSmh*k@&_xV4Xw(=s{o3NG9&;dufFc!K6c zSRHZ`mdx#_O`U5ew75h6(qfiE>*bynlk;fE!f!-C!;UNe5v4pi=@|pRSsDSF1|OCB z6QpJ8xw)%LT`FD9wS5j}dxw~Zx}BODv!s;Fuu>~i-c>NuX9!5I82DJq|9Sp2Rc m*%S%oaV__u??sQIAPUA!4S7orIBH3`@B8M4AGn0&62Adh=MT>S literal 0 HcmV?d00001 diff --git a/morph_transform.py b/morph_transform.py new file mode 100644 index 0000000..a0b588e --- /dev/null +++ b/morph_transform.py @@ -0,0 +1,79 @@ +import cv2 +import sys +import subprocess +import matplotlib.pyplot as plt +import matplotlib.patches as patches +import numpy as np + +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfdevice import PDFDevice +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTChar + +def transform(x, y, img_x, img_y, pdf_x, pdf_y): + x *= pdf_x / float(img_x) + y = abs(y - img_y) + y *= pdf_y / float(img_y) + return x, y + +# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ +def morph(imagename, p_x, p_y, s): + img = cv2.imread(imagename) + img_x, img_y = img.shape[1], img.shape[0] + pdf_x, pdf_y = p_x, p_y + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + vertical = th1 + horizontal = th1 + + scale = s + verticalsize = vertical.shape[0] / scale + horizontalsize = horizontal.shape[1] / scale + + ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) + hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) + + vertical = cv2.erode(vertical, ver, (-1, -1)) + vertical = cv2.dilate(vertical, ver, (-1, -1)) + + horizontal = cv2.erode(horizontal, hor, (-1, -1)) + horizontal = cv2.dilate(horizontal, hor, (-1, -1)) + + mask = vertical + horizontal + joints = np.bitwise_and(vertical, horizontal) + _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + + tables = {} + for c in contours: + x, y, w, h = cv2.boundingRect(c) + jmask = joints[y:y+h, x:x+w] + _, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + + if len(jc) <= 4: # remove contours with less than <=4 joints + continue + x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) + x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) + tables[(x1, y2)] = (x2, y1) + + v_segments, h_segments = [], [] + _, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for vc in vcontours: + x, y, w, h = cv2.boundingRect(vc) + x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) + x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) + v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) + + _, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for hc in hcontours: + x, y, w, h = cv2.boundingRect(hc) + x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) + x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) + h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + + return tables, v_segments, h_segments \ No newline at end of file diff --git a/morph_transform.pyc b/morph_transform.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f75a2d34873f08e20a19eb152034766fe2a13580 GIT binary patch literal 3446 zcmcH*%WfM-vU;c|MT&Zuk|o8lY%7U^AQ9qQfFQdx#f=46644@Jhj2iQW?JOX^U6$< zCWD!il0V2XxBZd*l|Ak`w;YnKs^-WFoXY}Ua;B%My1L%o)umsn<-dKRo;xu6S@``s zzUgl`xcCV00lGJ^fUYR$7IduwF2QgPy7L9P0K-M-E*9t#be9>nz%N6v0^Jo5B)tTF z1%fJcs|KF~e-(lnbZZ8mXM7F1YX)Bce;tAi=x!K%k?~FFZW??E{5ufTp<6fjGWc5% zY(sb3;AQadLa+nf9SC-zyNlEn_z@l41GmD!d*t%PVHMr?{=nbap+HbCoh0rL^K!z5(S2G$x(?etf9@1lkjs{Vnj>b`< z)#T_Ray;8_Msa!+C(%$j8Ugp}NUKyIId-7@NFN1J5)XQqi7>s2lHjv=%E`SUP`pzL z6^J`x4PS@iLK=z4@i{~aMrO#~QYhvqRzutj0(NGQ1v^$g5BUOz%mOQ4gnWsYm1dE} z9dm6#zRX07BX62)nL>#hgA^Ek+&XjM&cpQrd;TVW!rBx3az5jpW$2X~c; zm4ed{YcnEAtH5@?!S14|jFTywyr2ddPND0@vUOVa5*6QpyiTLufUFMgBJ*s5ixIkm z0khm?*hdZzAm4(?e~9V$yMIHz&5U)(wkfPH=ODie`3~f}kncgZ3E2)md$9eJ1=(GG zb{WAi?BwqiMj2p>(c4DdW23eqyT@rO!z%&Ff%W}AMTj+GzC?H5 zhx~rQfC|YDG~zE9_aT1(**;|VDKps?WbZ@vfZ#7t%kw?}K34t^@<)ZSCQENe@PLe+ z`~ukl2IptUKPZCp!CwUD9UA`-hUgxey${=X-qz9l+ZZ~AJ3)Z>{$JK}$~_i=DWA7p z_^@!{L*v3ng$p13e_eP~xNx6>Z@l1~6jU zXsFTfJGK|@F?=Y#Qd3RcBT-|8O~xw|3mrWk9o~G5GBXeV8@h`EScfxy%m=?0B+7RA zpiKvMthhBeW8I8=JV{rPy?OrZT=t%PEgyd^8{cZ`DzoqlX)EVW;$yq~wtLsSVD}AApk)ub67xr7e zqE@QO_tSIPe%h0*W{2%mN#rV4<$AuYm8SN4>FL*As(NI&jlUg$QXP-7QOS)puo zWUuw2(~@V6XL=J^nok=iXT3(dEze){E>2F{&sr=sr556{RL4kYV2m^+pp%Rgi@%IU zp^Iip>?~{}i*Gj1Pum<`Uxkv|2Me_0ScUE(H4bhtUVt4u#3sb7G4?cSg>i4n{V!z< z`=6xfOR5h^l_M$gY(+n@r_7`WpL4z_;GSdqY{?)>ydThRM5&RcNH;bYT&eei;>^C` zWD*%cJoLpWRu$OkEB0X+d7)0(z@Dj*BDmbX?Moj!E0wk*Dg2Uh%=l{_2W-ob#Zj#c z9WI&4=lq$P&*ZVe9{?aC ziAva~)@}Ik9?F)qH8>^d)C3|OUE-C-QEAG1F|c)vml@x?#QO_DoUR$jb>(Ypm1y)W zMq4`r%oiOp@eC%v!ch=UCEZhb1zwl}FI35A2*iBdl{iIh7MI7FL3 wTzE6eHbQS|f}wYJbtGM129Z1R)mIdVbRUO`pnnbj6>AOekhlL^*7D+i0lBC0Pyhe` literal 0 HcmV?d00001 diff --git a/pdf.py b/pdf.py new file mode 100644 index 0000000..d210953 --- /dev/null +++ b/pdf.py @@ -0,0 +1,54 @@ +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfdevice import PDFDevice +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal + +text = [] + +def parse_text_basic(layout): + global text + try: + for obj in layout._objs: + if type(obj) is LTTextLineHorizontal: + text.append(obj) + parse_text_basic(obj) + except AttributeError: + pass + +def parse_text_spreadsheet(layout): + global text + try: + for obj in layout._objs: + if type(obj) is LTChar: + text.append(obj) + parse_text_spreadsheet(obj) + except AttributeError: + pass + +def get_pdf_info(pdfname, method): + global text + with open(pdfname, 'r') as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + laparams = LAParams() + rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + text = [] + if method == 'basic': + parse_text_basic(layout) + elif method == 'spreadsheet': + parse_text_spreadsheet(layout) + pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] + text.sort(key=lambda x: (-x.y0, x.x0)) + return text, pdf_x, pdf_y \ No newline at end of file diff --git a/pdf.pyc b/pdf.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c65f477ed782afd85a4656feecc430c7263b842 GIT binary patch literal 2442 zcmc&#TW=dh6#iz{_ePAeAU=D!H<#to82LS!XZq%sAM{ z782o|U&KTI1b>8Q1n0~+mr~w{M4XvD=gjTQIp252|D)Y_`>7m6*#3F+{gRq~OI=L= z0x6IQ32$A883$8N;&EOIEL7SaEt8X%$%w!)21;RbV4g3NO?;>tm;U40a749Q$O{FtDbcUC5X`U~Q3L}#g`B7mG z(zN(N#%|J{n-oZ0EL0@F4fAk%dCYd~m@MDqMyk1z)Vp^O%FDykP?>vHWDj2?x%|3N$&W>D!ZhNWb6U&P4yozysay52 z`~quW2J~NlhK+y`SPNW`;Nh$W^#sPpth67f?7B2u6$?XN0(3-|KCqqsoTT1qk;&e9p=h(-Nf9MsdRWYLZ?1}&j0_3! zq-Ufyy(r9NS{ODwny-Q#THL0l`Dl2)=!iYr+TwqWWx!AkTHG;W0`@UrAI$^mZUjyf zxWZVUN@)U)sSQ}bkO~-vw-u+mwATmJ{w+?NBKD8LYBBeaP-wt_ql$9j;nzR0@hs## zhTO(y)UACh6XZ&m`Or@pCqD}~=v@GtGAEve3QQI23fH4`nO2ESm8n6Nb(Qzu)Udo@ z!p|DK8y(Sqt+1(MUBj%2b)8Lu=FVDnzXrIVv>D&Ak24SIHxTyL4LdL2w*w_R@H@=T z)c{?ofmsJ}8T(tQf_-n(#HNKeKVe|nnC&92BCc^^x*(zqh$P>=qJcFz(!#_i%5X-H zh=G8z3{OK!C^MW1DivJ@Eg_3`i6=SL1zTt6b6UhUI}>ZLWOIp%w+}~f|8k|$D$1tHPK<3$H=MfU zy6oHI*kMabn_@a{w7im5w-tYm2VbAJPx49e3F+Ey?(K;d{p#D_p4br&M4K!XYW!`B zkHx+?>T+>0oH5!XqnAZJBEbc~rp`V*H^;6bVsmPX`a3J>+MGO_+V?hgQF6)VJij-= zMKs`t@7=Me$j>Dedu2gF6usfGdng=^Z0%%N`~wU@k8Ia*x6*%lNF4aHB3`8O8OK6@ TNL`1XlAFJeyoy(E)Ej>TvIhDk literal 0 HcmV?d00001 diff --git a/spreadsheet.py b/spreadsheet.py new file mode 100644 index 0000000..8a7aa41 --- /dev/null +++ b/spreadsheet.py @@ -0,0 +1,103 @@ +import os +import csv +import glob +import numpy as np +import matplotlib.pyplot as plt + +from table import Table +from pdf import get_pdf_info +from morph_transform import morph + +def remove_close_values(ar): + ret = [] + for a in ar: + if not ret: + ret.append(a) + else: + temp = ret[-1] + if np.isclose(temp, a, atol=1): + pass + else: + ret.append(a) + return ret + +def get_row_idx(t, rows): + for r in range(len(rows)): + if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]: + return r + +def get_column_idx(t, columns): + for c in range(len(columns)): + if abs(t.x0 + t.x1) / 2.0 > columns[c][0] and abs(t.x0 + t.x1) / 2.0 < columns[c][1]: + return c + +def reduce_index(t, r_idx, c_idx): + if t.cells[r_idx][c_idx].spanning_h: + while not t.cells[r_idx][c_idx].left: + c_idx -= 1 + if t.cells[r_idx][c_idx].spanning_v: + while not t.cells[r_idx][c_idx].top: + r_idx -= 1 + return r_idx, c_idx + +def fill(t): + for i in range(len(t.cells)): + for j in range(len(t.cells[i])): + if t.cells[i][j].get_text().strip() == '': + if t.cells[i][j].spanning_h: + t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) + elif t.cells[i][j].spanning_v: + t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) + return t + +def spreadsheet(pdf_dir, filename, guess, scale): + print "working on", filename + imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') + text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') + tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) + + num_tables = 0 + for k in sorted(tables.keys(), reverse=True): + # find rows and columns that lie in table + lb = k + rt = tables[k] + v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] + h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] + columns = [v[0] for v in v_s] + rows = [h[1] for h in h_s] + # sort horizontal and vertical segments + columns = remove_close_values(sorted(columns)) + rows = remove_close_values(sorted(rows, reverse=True)) + # make grid using x and y coord of shortlisted rows and columns + columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] + rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] + + table = Table(columns, rows) + # pass row and column line segments to table method and light up cell edges + table = table.set_edges(v_s, h_s) + # table set span method + table = table.set_spanning() + # fill text after sorting it + text.sort(key=lambda x: (-x.y0, x.x0)) + + for t in text: + r_idx = get_row_idx(t, rows) + c_idx = get_column_idx(t, columns) + if None in [r_idx, c_idx]: + pass + else: + r_idx, c_idx = reduce_index(table, r_idx, c_idx) + table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) + + if guess: + table = fill(table) + + csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' + csvpath = os.path.join(pdf_dir, csvname) + with open(csvpath, 'w') as outfile: + writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) + for i in range(len(table.cells)): + writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))]) + print "saved as", csvname + print + num_tables += 1 \ No newline at end of file diff --git a/spreadsheet.pyc b/spreadsheet.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8001f735317ba6359f7a45f543558403044fa14d GIT binary patch literal 4601 zcmcIo&2L;u5wCvZuklCx?U_m9Y#?OUArS8#SRxUutYWk)6dCfI#bB)&jpsevGk!B~ z=5^a1OY22D4N>UC{5iG6nx-Tl~AJJDXX(cVihiCSsaIjD2Z5XuP3 z3lvK{_C<0dz3)k1WPN8t@<94V>?77qk$xa&Mof2#eF#y>3*s;Xssbk?{1xOZFxiH9 zqHBw`uxfLKn{KG!j?J1&lLm!ZDWGx`r`GAj?Z{Ct?T#D8r(x?!YGYli>{CLe*=FgM^020Ym()?`#ua90y=GfgkE2DPac4E`khzR)A>b5E9IP9$f)+L!`s1 z&F;KxMKZt|O4WbQ8~k{zX)J0nS*;B#p}0xh)^BasD#WMn`p<>GQW$$4&7MayoaHzz zyPhUTt+vOre$#kzcjD&x^c|dPOaT; zwD+qAYLH31=dO<*t0lX%qqghOrpaf-K<&QH)=lG0y=QTBRFC0uqqz4)%1R}OSuiCt zdtTRhmx%g>@BIfly}u4T1mIUkt4`w2>XH9+ZH}tJq|X0VowP2w&=`EMIuq3JC5M4z z@2mFIn|q}WOZKZb$40IX!1t6Za7^HWolN&H*QYDZQOQ0tc7=zoR|~qzqx-(#Z>zf` zTd=cDRnP`p^c~-A31OjCKqs?o@l$WsOdFXE0ZbBAp9q-d9vkoEz$@nm4U# zI`_Rsl03%O&Y%)=*W5P`pyGE?zG3R5cngI_@dCwu8zWH1hfvdpAK}EfSd5%{(*QHz z0?de?m55;=kbFpB-fs|M961B(PYRL`OFkm`C_Q<9NczBjl#%#sNN2|+FG@a6midJA z{!l=0?T-o6M!OgxtY^lo@BF5aPfCAMa)k9UHY)ioW&-^pVpJmewDdJCsr09qom7yR zlo@M2)$t;{1kx8jUZShSI0$(T5OOJ+Lxn>l>8v5CpDAIW9SCv-JNV)KOb4RgqhaBoMLdoEZbug_Z=`TqW3ZIew z+?A1A0^8s$5?W{uvQ9APJQHJjKjP^6PEYcZIIJrTn3cr=6m)?AcvqMO_si0+$bKM) zBa;0y=>9%HDuq4$_*p?d0cDxQ6a*5NaqJ4B4L~SKbBS^^OA<|n zJOEu1MjTwDYqtldnxO7WOQqXg0#AjsV1geb z8l_3Hp|LugMl)PQ<=+{cx$*GIFz_s?wRWT0+AFjzidb$nZi}d^j?bU!!^oe#ZjTj@ z6FqVFD0KkIG7UI#dtZHO$uVpvnM6i+5qA5YD^;v{RhIMYpEef*u*->QD_@}&k+Karj&Rot$p zQS498bd;~fsF5il!du+Nt4L43k4KO7IMi!Ntb7}-+P}t;w%JZ$8aQ^iM(TG^-EiIzei=N!hGJi$5>qs9n|I8;V8pzI zGHy=sTQsls9907c@Ip( zZ=zD8*iX`3#Q~^xPD4e6H@Q`F9lUju#x9<|=*1uR%sUBLoa@(I(OwX^(~nwo;*i=n+)%WXyu-Ol`+#KGezYcn?BOiJuI|HJt9w0kH0+7 zz{G$L5~~Kz%K;M#AbQ1S;$kYg)mh2J6nR>Y!p&AY5!K`9#{WtRBinhO{gXAMwd zq4f$6Nl5ZKkE-iY%dc)lvGKR7m!f)53(rQKq}qw24Zm)9&{(S)KQYyMt>uT2sT!E} zh0b*|!^b@@Xa~mgz&L?NNG+>bCAORA8J-c_eZ!mEZ&|GHxhrp2sqJ7fq76kmSbd&8 z)Uac}Kt|JsMKB{9uE)-Y{WQhLU}0GC7#?I-3=r%T;YC4k#u4Cw6aj(6Pihr=)N=&K zqrS5jPH`Nkkb`rRdPTvxNqu&yc)zLW8jJd!;;s0C8iYiD1}vdRe-?*iRHE(AsK@gX zU1R^8QS4qA#KB^@H$!EQQD+hUukx|(6KbV_m2I@}j}QkjK}GHd(czotbQR^YbIvsA|R zKV{_pEeE%VbQ|*rtWfsG^!X~~mnQz0g(lUwkYRV;nuLa{C)aY=kp=ZukV&XQoJn)j z@jIPKbBLi~9w3vN2qR=t(;Ca9K`u^b5+jUqag<57tU)H-vc@uL3@4jOjAQRgGQzoM zv11_5J!>+P{7ON0sEWs1B=x8skO&@L0|B z-s71)j0bx$z^|6lYvYzoi8_;PJoWQT3fSbWc5`=J+TTs7pra!>aaZU3dS zdwjhq5BGo7eYw=}5iE1PF7u^khj4LBuQq75K6PKgH@!1r<)}{i>>jcyxijFsyH=pM zJo%iHv&Pw59yaok9NfaXW!<%`2~Gt{&Z~(vI|udQIXRSk{~(<5^xQ+IQoc?_N~ZM8 z(4cunQt%+SI;j!x?X7crS)^7UlB2UxOcaTR-t`k?I z&340Yr1ipm4EqN_u5CAO(>(&RgcFzw_@k?BD8Q&~ zLqV=%{xTC%N7Zrlq!h4uE@h|GQ&aR9?^e{ayaW1-N?*!Q$vv;l?@*p6T6tb8YHWr) zb+P9@4RVsnNDUc1WSw@0B~9MW95q{*jZzNMU_iK`b*{VOKsXhzm-4bHM<1;;9MD9k PQpHb0vIHFG)dBr4?U~R@ literal 0 HcmV?d00001