Reviewed patch: move FileHandler out of camelot/core.py into
camelot/handlers.py as PDFHandler and wire it into read_pdf.

Notes on this revision of the patch:
- Line breaks and indentation were restored from a whitespace-collapsed
  copy; if context or removed lines fail to match the target tree, retry
  with `git apply --ignore-whitespace`.
- camelot/handlers.py: both PdfFileReader(open(...)) calls now use `with`
  so the file handles are closed; docstrings added.
- camelot/io.py: docstring added to read_pdf.
- The post-image blob ids on the `index` lines of handlers.py and io.py
  predate these edits.

diff --git a/camelot/__init__.py b/camelot/__init__.py
index 24ff0f1..b762cea 100644
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -1,3 +1,3 @@
 from .__version__ import __version__
 
-from .io import *
\ No newline at end of file
+from .io import read_pdf
\ No newline at end of file
diff --git a/camelot/core.py b/camelot/core.py
index 2cca37d..0b7a21b 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -1,11 +1,5 @@
-import os
-import tempfile
-
 import numpy as np
 import pandas as pd
-from PyPDF2 import PdfFileReader, PdfFileWriter
-
-from .utils import get_page_layout, get_text_objects, get_rotation
 
 
 class Cell(object):
@@ -238,76 +232,4 @@ class TableSet(object):
         pass
 
     def __repr__(self):
-        pass
-
-
-class FileHandler(object):
-    def __init__(self, filename, pages='1'):
-        self.filename = filename
-        if not self.filename.endswith('.pdf'):
-            raise TypeError("File format not supported.")
-        self.pages = __get_pages(pages)
-        self.temp = tempfile.mkdtemp()
-
-    @staticmethod
-    def __get_pages(filename, pages):
-        p = {}
-        if pages == '1':
-            p.append({'start': 1, 'end': 1})
-        else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
-            if pages == 'all':
-                p.append({'start': 1, 'end': infile.getNumPages()})
-            else:
-                for r in pages.split(','):
-                    if '-' in r:
-                        a, b = r.split('-')
-                        if b == 'end':
-                            b = infile.getNumPages()
-                        p.append({'start': int(a), 'end': int(b)})
-                    else:
-                        p.append({'start': int(r), 'end': int(r)})
-        return p
-
-    @staticmethod
-    def __save_page(filename, page, temp):
-        with open(filename, 'rb') as fileobj:
-            infile = PdfFileReader(fileobj, strict=False)
-            fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
-            fname, fext = os.path.splitext(fpath)
-            p = infile.getPage(page - 1)
-            outfile = PdfFileWriter()
-            outfile.addPage(p)
-            with open(fpath, 'wb') as f:
-                outfile.write(f)
-            layout, dim = get_page_layout(fpath)
-            # fix rotated pdf
-            lttextlh = get_text_objects(layout, ltype="lh")
-            lttextlv = get_text_objects(layout, ltype="lv")
-            ltchar = get_text_objects(layout, ltype="char")
-            rotation = get_rotation(lttextlh, lttextlv, ltchar)
-            if rotation != '':
-                fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext])
-                os.rename(fpath, fpath_new)
-                infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
-                outfile = PdfFileWriter()
-                p = infile.getPage(0)
-                if rotation == 'left':
-                    p.rotateClockwise(90)
-                elif rotation == 'right':
-                    p.rotateCounterClockwise(90)
-                outfile.addPage(p)
-                with open(fpath, 'wb') as f:
-                    outfile.write(f)
-
-    def parse(self):
-        for p in self.pages:
-            __save_page(self.filename, p, self.temp)
-        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
-                 for p in self.pagenos]
-        tables = {}
-        for p in pages:
-            table = self.parser.get_tables(p)
-            if table is not None:
-                tables.update(table)
-        return tables
\ No newline at end of file
+        pass
\ No newline at end of file
diff --git a/camelot/handlers.py b/camelot/handlers.py
new file mode 100644
index 0000000..83fe4ae
--- /dev/null
+++ b/camelot/handlers.py
@@ -0,0 +1,111 @@
+import os
+import tempfile
+
+from PyPDF2 import PdfFileReader, PdfFileWriter
+
+from .parsers import Stream, Lattice
+from .utils import get_page_layout, get_text_objects, get_rotation
+
+
+class PDFHandler(object):
+    """Split a PDF into single-page files and parse tables from them.
+
+    `pages` is '1', 'all', or a comma-separated list of page numbers and
+    'start-end' ranges, where the end of a range may be the word 'end'.
+    """
+
+    def __init__(self, filename, pages='1'):
+        self.filename = filename
+        if not self.filename.endswith('.pdf'):
+            raise TypeError("File format not supported.")
+        self.pages = self.__get_pages(self.filename, pages)
+        self.temp = tempfile.mkdtemp()
+
+    def __get_pages(self, filename, pages):
+        """Return the sorted, de-duplicated page numbers named by `pages`."""
+        # refactor
+        page_numbers = []
+        if pages == '1':
+            page_numbers.append({'start': 1, 'end': 1})
+        else:
+            # Scope the handle to this block; the reader is only needed to
+            # look up the page count.
+            with open(filename, 'rb') as fileobj:
+                infile = PdfFileReader(fileobj, strict=False)
+                if pages == 'all':
+                    page_numbers.append(
+                        {'start': 1, 'end': infile.getNumPages()})
+                else:
+                    for r in pages.split(','):
+                        if '-' in r:
+                            a, b = r.split('-')
+                            if b == 'end':
+                                b = infile.getNumPages()
+                            page_numbers.append(
+                                {'start': int(a), 'end': int(b)})
+                        else:
+                            page_numbers.append(
+                                {'start': int(r), 'end': int(r)})
+        P = []
+        for p in page_numbers:
+            P.extend(range(p['start'], p['end'] + 1))
+        return sorted(set(P))
+
+    def __save_page(self, filename, page, temp):
+        """Write page number `page` of `filename` to `temp`/page-<page>.pdf.
+
+        When get_rotation reports a rotation, the extracted file is renamed
+        with a '_rotated' suffix and a copy turned by 90 degrees (for 'left'
+        or 'right') is written back to the original path.
+        """
+        # refactor
+        with open(filename, 'rb') as fileobj:
+            infile = PdfFileReader(fileobj, strict=False)
+            fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
+            fname, fext = os.path.splitext(fpath)
+            p = infile.getPage(page - 1)
+            outfile = PdfFileWriter()
+            outfile.addPage(p)
+            with open(fpath, 'wb') as f:
+                outfile.write(f)
+            layout, dim = get_page_layout(fpath)
+            # fix rotated pdf
+            lttextlh = get_text_objects(layout, ltype="lh")
+            lttextlv = get_text_objects(layout, ltype="lv")
+            ltchar = get_text_objects(layout, ltype="char")
+            rotation = get_rotation(lttextlh, lttextlv, ltchar)
+            if rotation != '':
+                fpath_new = ''.join([fname.replace('page', 'p'), '_rotated', fext])
+                os.rename(fpath, fpath_new)
+                # Keep the source handle open until the rotated page has been
+                # written, then close it instead of leaking it.
+                with open(fpath_new, 'rb') as rotated:
+                    infile = PdfFileReader(rotated, strict=False)
+                    outfile = PdfFileWriter()
+                    p = infile.getPage(0)
+                    if rotation == 'left':
+                        p.rotateClockwise(90)
+                    elif rotation == 'right':
+                        p.rotateCounterClockwise(90)
+                    outfile.addPage(p)
+                    with open(fpath, 'wb') as f:
+                        outfile.write(f)
+
+    def parse(self, mesh=False, **kwargs):
+        """Save each selected page to the temp directory and parse it.
+
+        Uses Lattice when `mesh` is true and Stream otherwise; `kwargs` go to
+        the parser constructor. Returns a dict merged from the non-None
+        results of `parser.get_tables` for each page.
+        """
+        for p in self.pages:
+            self.__save_page(self.filename, p, self.temp)
+        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+                 for p in self.pages]
+        tables = {}
+        parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
+        for p in pages:
+            table = parser.get_tables(p)
+            if table is not None:
+                tables.update(table)
+        return tables
\ No newline at end of file
diff --git a/camelot/io.py b/camelot/io.py
index a78cf3a..fcae040 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -1,5 +1,12 @@
-from .core import *
+from .handlers import PDFHandler
 
 
-def read_pdf(filepath, pages='1', grid=True):
-    pass
\ No newline at end of file
+def read_pdf(filepath, pages='1', mesh=False, **kwargs):
+    """Parse tables from the PDF at `filepath`.
+
+    Builds a PDFHandler for `filepath` and `pages` and returns the result of
+    its parse(mesh=mesh, **kwargs) call.
+    """
+    # explicit type conversion
+    p = PDFHandler(filepath, pages)
+    return p.parse(mesh=mesh, **kwargs)
\ No newline at end of file
diff --git a/camelot/parsers.py b/camelot/parsers.py
index f47f51a..c175fee 100644
--- a/camelot/parsers.py
+++ b/camelot/parsers.py
@@ -10,7 +10,7 @@ import subprocess
 
 import numpy as np
 
-from .core import TableSet
+from .core import Table
 from .image_processing import (adaptive_threshold, find_lines,
                                find_table_contours, find_table_joints)
 from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,