Make present code work with new API

pull/2/head
Vinayak Mehta 2018-09-04 23:34:49 +05:30
parent e3aabb720f
commit 941994f0bf
5 changed files with 91 additions and 84 deletions

View File

@ -1,3 +1,3 @@
from .__version__ import __version__ from .__version__ import __version__
from .io import * from .io import read_pdf

View File

@ -1,11 +1,5 @@
import os
import tempfile
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from PyPDF2 import PdfFileReader, PdfFileWriter
from .utils import get_page_layout, get_text_objects, get_rotation
class Cell(object): class Cell(object):
@ -239,75 +233,3 @@ class TableSet(object):
def __repr__(self): def __repr__(self):
pass pass
class FileHandler(object):
    """Split a PDF into single-page files and run a parser over each one.

    Parameters
    ----------
    filename : str
        Path to the input file; must end with ``.pdf``.
    pages : str, optional
        Page specification: ``'1'`` (default), ``'all'``, or a
        comma-separated list of page numbers and ``a-b`` ranges, where
        ``b`` may be the literal ``end``.
    parser : object, optional
        Object exposing ``get_tables(path)``; used by :meth:`parse`.
        It can also be assigned to ``self.parser`` after construction.

    Raises
    ------
    TypeError
        If ``filename`` does not end with ``.pdf``.
    """

    def __init__(self, filename, pages='1', parser=None):
        self.filename = filename
        if not self.filename.endswith('.pdf'):
            raise TypeError("File format not supported.")
        # Name-mangled helpers must be reached through ``self``; a bare
        # ``__get_pages(...)`` is looked up as a (missing) global.
        self.pages = self.__get_pages(self.filename, pages)
        # Flatten the {'start', 'end'} ranges into individual page numbers,
        # which is what __save_page() and parse() operate on.
        self.pagenos = sorted(set(
            n for r in self.pages for n in range(r['start'], r['end'] + 1)))
        self.parser = parser
        self.temp = tempfile.mkdtemp()

    @staticmethod
    def __get_pages(filename, pages):
        """Return the page spec as a list of ``{'start', 'end'}`` dicts.

        The file is only opened when the page count may be needed, i.e.
        for any spec other than ``'1'``.
        """
        ranges = []
        if pages == '1':
            ranges.append({'start': 1, 'end': 1})
            return ranges
        # Close the handle as soon as the page count has been read.
        with open(filename, 'rb') as fileobj:
            num_pages = PdfFileReader(fileobj, strict=False).getNumPages()
        if pages == 'all':
            ranges.append({'start': 1, 'end': num_pages})
            return ranges
        for r in pages.split(','):
            if '-' in r:
                a, b = r.split('-')
                if b == 'end':
                    b = num_pages
                ranges.append({'start': int(a), 'end': int(b)})
            else:
                ranges.append({'start': int(r), 'end': int(r)})
        return ranges

    @staticmethod
    def __save_page(filename, page, temp):
        """Write page ``page`` (1-based) of ``filename`` to ``temp``.

        The page is saved as ``page-<n>.pdf``.  If ``get_rotation`` reports
        a non-empty rotation, the first copy is renamed to
        ``p-<n>_rotated.pdf`` and a re-rotated copy is written under the
        original ``page-<n>.pdf`` name.
        """
        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # The source must stay open until the writer has copied the page.
            with open(fpath, 'wb') as f:
                outfile.write(f)
        layout, _ = get_page_layout(fpath)
        # fix rotated pdf
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation != '':
            # Build the new name from the file name only, so a temp
            # directory whose path contains "page" is left untouched.
            fpath_new = os.path.join(temp, 'p-{0}_rotated.pdf'.format(page))
            os.rename(fpath, fpath_new)
            with open(fpath_new, 'rb') as rotated:
                infile = PdfFileReader(rotated, strict=False)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                if rotation == 'left':
                    p.rotateClockwise(90)
                elif rotation == 'right':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

    def parse(self):
        """Save every requested page and merge the parser's results.

        Returns
        -------
        dict
            Union of the dicts returned by ``self.parser.get_tables`` for
            each page file; ``None`` results are skipped.

        Raises
        ------
        ValueError
            If no parser has been configured.
        """
        if self.parser is None:
            raise ValueError("No parser configured.")
        for p in self.pagenos:
            self.__save_page(self.filename, p, self.temp)
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pagenos]
        tables = {}
        for p in pages:
            table = self.parser.get_tables(p)
            if table is not None:
                tables.update(table)
        return tables

View File

@ -0,0 +1,83 @@
import os
import tempfile
from PyPDF2 import PdfFileReader, PdfFileWriter
from .parsers import Stream, Lattice
from .utils import get_page_layout, get_text_objects, get_rotation
class PDFHandler(object):
    """Split a PDF into single-page files and extract tables from them.

    Parameters
    ----------
    filename : str
        Path to the input file; must end with ``.pdf``.
    pages : str, optional
        Page specification: ``'1'`` (default), ``'all'``, or a
        comma-separated list of page numbers and ``a-b`` ranges, where
        ``b`` may be the literal ``end``.

    Raises
    ------
    TypeError
        If ``filename`` does not end with ``.pdf``.
    """

    def __init__(self, filename, pages='1'):
        self.filename = filename
        if not self.filename.endswith('.pdf'):
            raise TypeError("File format not supported.")
        self.pages = self.__get_pages(self.filename, pages)
        # NOTE(review): this directory is never removed here; whether the
        # page files are needed after parse() returns is not visible here.
        self.temp = tempfile.mkdtemp()

    @staticmethod
    def _expand_page_spec(pages, num_pages):
        """Expand a page spec into a sorted list of unique page numbers.

        ``num_pages`` is substituted for ``'all'`` and for an ``end``
        range bound.
        """
        if pages == 'all':
            return list(range(1, num_pages + 1))
        numbers = set()
        for part in pages.split(','):
            if '-' in part:
                start, end = part.split('-')
                end = num_pages if end == 'end' else int(end)
                numbers.update(range(int(start), end + 1))
            else:
                numbers.add(int(part))
        return sorted(numbers)

    def __get_pages(self, filename, pages):
        """Return the sorted, de-duplicated page numbers for ``pages``.

        The file is only opened when the spec is something other than
        ``'1'``, since only then can the page count be needed.
        """
        if pages == '1':
            return [1]
        # Close the handle as soon as the page count has been read.
        with open(filename, 'rb') as fileobj:
            num_pages = PdfFileReader(fileobj, strict=False).getNumPages()
        return self._expand_page_spec(pages, num_pages)

    def __save_page(self, filename, page, temp):
        """Write page ``page`` (1-based) of ``filename`` to ``temp``.

        The page is saved as ``page-<n>.pdf``.  If ``get_rotation`` reports
        a non-empty rotation, the first copy is renamed to
        ``p-<n>_rotated.pdf`` and a re-rotated copy is written under the
        original ``page-<n>.pdf`` name.
        """
        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # The source must stay open until the writer has copied the page.
            with open(fpath, 'wb') as f:
                outfile.write(f)
        layout, _ = get_page_layout(fpath)
        # fix rotated pdf
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation != '':
            # Build the new name from the file name only, so a temp
            # directory whose path contains "page" is left untouched.
            fpath_new = os.path.join(temp, 'p-{0}_rotated.pdf'.format(page))
            os.rename(fpath, fpath_new)
            with open(fpath_new, 'rb') as rotated:
                infile = PdfFileReader(rotated, strict=False)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                if rotation == 'left':
                    p.rotateClockwise(90)
                elif rotation == 'right':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

    def parse(self, mesh=False, **kwargs):
        """Save every requested page and collect the tables found on them.

        Parameters
        ----------
        mesh : bool, optional
            When true a ``Lattice`` parser is used, otherwise ``Stream``.
        **kwargs
            Forwarded unchanged to the chosen parser's constructor.

        Returns
        -------
        dict
            Union of the dicts returned by ``parser.get_tables`` for each
            page file; ``None`` results are skipped.
        """
        for p in self.pages:
            self.__save_page(self.filename, p, self.temp)
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
        tables = {}
        parser = Lattice(**kwargs) if mesh else Stream(**kwargs)
        for p in pages:
            table = parser.get_tables(p)
            if table is not None:
                tables.update(table)
        return tables

View File

@ -1,5 +1,7 @@
from .core import * from .handlers import PDFHandler
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
    """Extract tables from the PDF at ``filepath``.

    Builds a ``PDFHandler`` for the requested ``pages`` and returns the
    result of its ``parse`` call; ``mesh`` and any extra keyword arguments
    are forwarded to ``parse`` unchanged.
    """
    # NOTE(review): the original carried an "explicit type conversion"
    # placeholder here; no conversion of the arguments is performed yet.
    return PDFHandler(filepath, pages).parse(mesh=mesh, **kwargs)

View File

@ -10,7 +10,7 @@ import subprocess
import numpy as np import numpy as np
from .core import TableSet from .core import Table
from .image_processing import (adaptive_threshold, find_lines, find_table_contours, from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints) find_table_joints)
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,