Compare commits
1 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
6744e0525a |
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import copy
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
|
|
@ -89,51 +90,56 @@ class PDFHandler(object):
|
||||||
P.extend(range(p["start"], p["end"] + 1))
|
P.extend(range(p["start"], p["end"] + 1))
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def _save_page(self, filepath, page, temp):
|
def _save_pages(self, filepath, pages, temp):
|
||||||
"""Saves specified page from PDF into a temporary directory.
|
"""Saves specified page from PDF into a temporary directory.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Filepath or URL of the PDF file.
|
Filepath or URL of the PDF file.
|
||||||
page : int
|
pages : int
|
||||||
Page number.
|
Page numbers.
|
||||||
temp : str
|
temp : str
|
||||||
Tmp directory.
|
Tmp directory.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filepath, "rb") as fileobj:
|
with open(filepath, "rb") as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile_original = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile_original.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile_original.decrypt(self.password)
|
||||||
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
|
||||||
froot, fext = os.path.splitext(fpath)
|
for page in pages:
|
||||||
p = infile.getPage(page - 1)
|
# Ensure PdfFileReader object is unmodified
|
||||||
outfile = PdfFileWriter()
|
infile = copy.copy(infile_original)
|
||||||
outfile.addPage(p)
|
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||||
with open(fpath, "wb") as f:
|
froot, fext = os.path.splitext(fpath)
|
||||||
outfile.write(f)
|
p = infile.getPage(page - 1)
|
||||||
layout, dim = get_page_layout(fpath)
|
|
||||||
# fix rotated PDF
|
|
||||||
chars = get_text_objects(layout, ltype="char")
|
|
||||||
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
|
||||||
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
|
||||||
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
|
||||||
if rotation != "":
|
|
||||||
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
|
|
||||||
os.rename(fpath, fpath_new)
|
|
||||||
infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
|
|
||||||
if infile.isEncrypted:
|
|
||||||
infile.decrypt(self.password)
|
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
p = infile.getPage(0)
|
|
||||||
if rotation == "anticlockwise":
|
|
||||||
p.rotateClockwise(90)
|
|
||||||
elif rotation == "clockwise":
|
|
||||||
p.rotateCounterClockwise(90)
|
|
||||||
outfile.addPage(p)
|
outfile.addPage(p)
|
||||||
with open(fpath, "wb") as f:
|
with open(fpath, 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
|
# Orient rotated pages correctly
|
||||||
|
layout, dim = get_page_layout(fpath)
|
||||||
|
chars = get_text_objects(layout, ltype="char")
|
||||||
|
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||||
|
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||||
|
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||||
|
if rotation != '':
|
||||||
|
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||||
|
os.rename(fpath, fpath_new)
|
||||||
|
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||||
|
if infile.isEncrypted:
|
||||||
|
infile.decrypt(self.password)
|
||||||
|
outfile = PdfFileWriter()
|
||||||
|
p = infile.getPage(0)
|
||||||
|
if rotation == 'anticlockwise':
|
||||||
|
p.rotateClockwise(90)
|
||||||
|
elif rotation == 'clockwise':
|
||||||
|
p.rotateCounterClockwise(90)
|
||||||
|
outfile.addPage(p)
|
||||||
|
with open(fpath, 'wb') as f:
|
||||||
|
outfile.write(f)
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
||||||
|
|
@ -161,8 +167,7 @@ class PDFHandler(object):
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
self._save_pages(self.filepath, self.pages, tempdir)
|
||||||
self._save_page(self.filepath, p, tempdir)
|
|
||||||
pages = [
|
pages = [
|
||||||
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
||||||
]
|
]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue