fix-21
Vinayak Mehta 2019-10-15 15:24:21 +05:30
parent f1879726d9
commit 6744e0525a
1 changed files with 39 additions and 34 deletions

View File

@ -2,6 +2,7 @@
import os
import sys
import copy
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -89,50 +90,55 @@ class PDFHandler(object):
P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P))
def _save_page(self, filepath, page, temp):
def _save_pages(self, filepath, pages, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
pages : int
Page numbers.
temp : str
Tmp directory.
"""
with open(filepath, "rb") as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
infile_original = PdfFileReader(fileobj, strict=False)
if infile_original.isEncrypted:
infile_original.decrypt(self.password)
for page in pages:
# Ensure PdfFileReader object is unmodified
infile = copy.copy(infile_original)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
outfile = PdfFileWriter()
outfile.addPage(p)
with open(fpath, "wb") as f:
with open(fpath, 'wb') as f:
outfile.write(f)
# Orient rotated pages correctly
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == "anticlockwise":
if rotation == 'anticlockwise':
p.rotateClockwise(90)
elif rotation == "clockwise":
elif rotation == 'clockwise':
p.rotateCounterClockwise(90)
outfile.addPage(p)
with open(fpath, "wb") as f:
with open(fpath, 'wb') as f:
outfile.write(f)
def parse(
@ -161,8 +167,7 @@ class PDFHandler(object):
"""
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
self._save_pages(self.filepath, self.pages, tempdir)
pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
]