Parallelize pdf split

pull/2/head
Vinayak Mehta 2017-04-11 18:30:05 +05:30
parent 4a87a77003
commit 7246e1a73d
1 changed file with 38 additions and 28 deletions

View File

@ -4,6 +4,7 @@ import logging
import tempfile import tempfile
import itertools import itertools
import multiprocessing as mp import multiprocessing as mp
from functools import partial
import cv2 import cv2
from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2 import PdfFileReader, PdfFileWriter
@ -37,6 +38,37 @@ def _parse_page_numbers(pagenos):
return page_numbers return page_numbers
def _save_page(temp, pdfname, pageno):
    """Write one page of a PDF to its own file and correct its rotation.

    Page ``pageno`` of ``pdfname`` is saved as ``<temp>/page-<pageno>.pdf``.
    The text objects of that single-page file are then passed to
    ``get_rotation``. When it reports a rotation (any value other than
    ``''``), the unrotated file is renamed to
    ``<temp>/p-<pageno>_rotated.pdf`` and ``page-<pageno>.pdf`` is written
    again from it: turned 90 degrees clockwise for ``'left'``, 90 degrees
    counter-clockwise for ``'right'``, and unchanged for any other value.

    This is a module-level function taking only plain arguments so that it
    can be bound with ``functools.partial`` and mapped over page numbers.

    Parameters
    ----------
    temp : str
        Directory in which the single-page files are created.
    pdfname : str
        Path of the source PDF.
    pageno : int
        1-based number of the page to extract.

    """
    sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno))
    sp_ext = os.path.splitext(sp_path)[1]

    with open(pdfname, 'rb') as pdffile:
        infile = PdfFileReader(pdffile, strict=False)
        outfile = PdfFileWriter()
        outfile.addPage(infile.getPage(pageno - 1))
        # The writer takes page content from the reader's stream when it
        # writes, so the source must stay open until the write is done.
        with open(sp_path, 'wb') as f:
            outfile.write(f)

    # The source PDF is no longer needed; everything below works on the
    # single-page file that was just written.
    layout, _ = get_page_layout(sp_path)
    lttextlh = get_text_objects(layout, ltype="lh")
    lttextlv = get_text_objects(layout, ltype="lv")
    ltchar = get_text_objects(layout, ltype="char")
    rotation = get_rotation(lttextlh, lttextlv, ltchar)
    if rotation == '':
        return

    # Build the backup name from the file name only. Rewriting 'page' across
    # the whole path would also alter a temp directory containing 'page'.
    sp_new_path = os.path.join(
        temp, 'p-{0}_rotated{1}'.format(pageno, sp_ext))
    os.rename(sp_path, sp_new_path)

    # Use a context manager so the backup file's handle is always closed;
    # the write stays inside it for the same reason as above.
    with open(sp_new_path, 'rb') as rotated_src:
        sp_in = PdfFileReader(rotated_src, strict=False)
        sp_out = PdfFileWriter()
        sp_page = sp_in.getPage(0)
        if rotation == 'left':
            sp_page.rotateClockwise(90)
        elif rotation == 'right':
            sp_page.rotateCounterClockwise(90)
        sp_out.addPage(sp_page)
        with open(sp_path, 'wb') as pdf_out:
            sp_out.write(pdf_out)
class Pdf: class Pdf:
"""Pdf manager. """Pdf manager.
Handles all operations like temp directory creation, splitting file Handles all operations like temp directory creation, splitting file
@ -85,34 +117,12 @@ class Pdf:
"""Splits file into single page pdfs. """Splits file into single page pdfs.
""" """
logger.info('Splitting pages...') logger.info('Splitting pages...')
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) if self.parallel:
pfunc = partial(_save_page, self.temp, self.pdfname)
self.pool.map(pfunc, self.pagenos)
else:
for p in self.pagenos: for p in self.pagenos:
sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p)) _save_page(self.temp, self.pdfname, p)
sp_name, sp_ext = os.path.splitext(sp_path)
page = infile.getPage(p - 1)
outfile = PdfFileWriter()
outfile.addPage(page)
with open(sp_path, 'wb') as f:
outfile.write(f)
layout, dim = get_page_layout(sp_path)
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
rotation = get_rotation(lttextlh, lttextlv, ltchar)
if rotation != '':
sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext])
os.rename(sp_path, sp_new_path)
sp_in = PdfFileReader(open(sp_new_path, 'rb'),
strict=False)
sp_out = PdfFileWriter()
sp_page = sp_in.getPage(0)
if rotation == 'left':
sp_page.rotateClockwise(90)
elif rotation == 'right':
sp_page.rotateCounterClockwise(90)
sp_out.addPage(sp_page)
with open(sp_path, 'wb') as pdf_out:
sp_out.write(pdf_out)
def extract(self): def extract(self):