fix-21
Vinayak Mehta 2019-10-15 15:24:21 +05:30
parent f1879726d9
commit 6744e0525a
1 changed file with 39 additions and 34 deletions

View File

@ -2,6 +2,7 @@
import os import os
import sys import sys
import copy
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
@ -89,50 +90,55 @@ class PDFHandler(object):
P.extend(range(p["start"], p["end"] + 1)) P.extend(range(p["start"], p["end"] + 1))
return sorted(set(P)) return sorted(set(P))
def _save_page(self, filepath, page, temp): def _save_pages(self, filepath, pages, temp):
"""Saves specified page from PDF into a temporary directory. """Saves specified pages from PDF into a temporary directory.
Parameters Parameters
---------- ----------
filepath : str filepath : str
Filepath or URL of the PDF file. Filepath or URL of the PDF file.
page : int pages : list of int
Page number. Page numbers.
temp : str temp : str
Tmp directory. Tmp directory.
""" """
with open(filepath, "rb") as fileobj: with open(filepath, "rb") as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile_original = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile_original.isEncrypted:
infile.decrypt(self.password) infile_original.decrypt(self.password)
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
for page in pages:
# Ensure PdfFileReader object is unmodified
infile = copy.copy(infile_original)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
froot, fext = os.path.splitext(fpath) froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1) p = infile.getPage(page - 1)
outfile = PdfFileWriter() outfile = PdfFileWriter()
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, 'wb') as f:
outfile.write(f) outfile.write(f)
# Orient rotated pages correctly
layout, dim = get_page_layout(fpath) layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char") chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text") vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "": if rotation != '':
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, "rb"), strict=False) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
outfile = PdfFileWriter() outfile = PdfFileWriter()
p = infile.getPage(0) p = infile.getPage(0)
if rotation == "anticlockwise": if rotation == 'anticlockwise':
p.rotateClockwise(90) p.rotateClockwise(90)
elif rotation == "clockwise": elif rotation == 'clockwise':
p.rotateCounterClockwise(90) p.rotateCounterClockwise(90)
outfile.addPage(p) outfile.addPage(p)
with open(fpath, "wb") as f: with open(fpath, 'wb') as f:
outfile.write(f) outfile.write(f)
def parse( def parse(
@ -161,8 +167,7 @@ class PDFHandler(object):
""" """
tables = [] tables = []
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
for p in self.pages: self._save_pages(self.filepath, self.pages, tempdir)
self._save_page(self.filepath, p, tempdir)
pages = [ pages = [
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
] ]