Add temporary directory context manager
parent
5116234bc7
commit
d3beaafc99
|
|
@ -1,11 +1,11 @@
|
||||||
import os
|
import os
|
||||||
import tempfile
|
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
from .core import TableList, GeometryList
|
from .core import TableList, GeometryList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import get_page_layout, get_text_objects, get_rotation
|
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
|
get_rotation)
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
|
|
@ -27,7 +27,6 @@ class PDFHandler(object):
|
||||||
if not self.filename.endswith('.pdf'):
|
if not self.filename.endswith('.pdf'):
|
||||||
raise TypeError("File format not supported.")
|
raise TypeError("File format not supported.")
|
||||||
self.pages = self._get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filename, pages)
|
||||||
self.tempdir = tempfile.mkdtemp()
|
|
||||||
|
|
||||||
def _get_pages(self, filename, pages):
|
def _get_pages(self, filename, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
@ -130,15 +129,16 @@ class PDFHandler(object):
|
||||||
found in pdf.
|
found in pdf.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for p in self.pages:
|
|
||||||
self._save_page(self.filename, p, self.tempdir)
|
|
||||||
pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
|
|
||||||
for p in self.pages]
|
|
||||||
tables = []
|
tables = []
|
||||||
geometry = []
|
geometry = []
|
||||||
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in pages:
|
for p in self.pages:
|
||||||
t, g = parser.extract_tables(p)
|
self._save_page(self.filename, p, tempdir)
|
||||||
tables.extend(t)
|
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||||
geometry.append(g)
|
for p in self.pages]
|
||||||
|
parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
|
||||||
|
for p in pages:
|
||||||
|
t, g = parser.extract_tables(p)
|
||||||
|
tables.extend(t)
|
||||||
|
geometry.append(g)
|
||||||
return TableList(tables), GeometryList(geometry)
|
return TableList(tables), GeometryList(geometry)
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
import logging
|
import logging
|
||||||
|
import tempfile
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
|
|
@ -18,6 +20,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
LTTextLineVertical)
|
LTTextLineVertical)
|
||||||
|
|
||||||
|
|
||||||
|
# https://stackoverflow.com/a/22726782
|
||||||
|
class TemporaryDirectory(object):
    """Context manager that creates a temporary directory and removes it on exit.

    Entering the context creates a fresh directory with ``tempfile.mkdtemp()``
    and yields its path as a string. Leaving the context (normally or through
    an exception) deletes the directory and everything inside it.

    Exceptions raised inside the ``with`` body are never suppressed.
    """

    def __enter__(self):
        """Create the temporary directory and return its path."""
        self.name = tempfile.mkdtemp()
        return self.name

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the temporary directory tree.

        Raises
        ------
        OSError
            If the directory still exists after the removal attempt
            (e.g. because of a permissions problem).
        """
        try:
            shutil.rmtree(self.name)
        except OSError:
            # The with-body may already have removed or moved the directory.
            # In that case the goal of the cleanup is met, so do not raise
            # (which could also hide an exception coming from the body).
            # Any failure that leaves the directory behind is still reported.
            if os.path.exists(self.name):
                raise
|
||||||
|
|
||||||
|
|
||||||
def setup_logging(name):
|
def setup_logging(name):
|
||||||
"""Sets up a logger with StreamHandler.
|
"""Sets up a logger with StreamHandler.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue