pull/2/head
Vinayak Mehta 2019-01-03 14:55:47 +05:30
parent 7a0acd7929
commit f605bd8f94
5 changed files with 30 additions and 19 deletions

View File

@ -107,10 +107,10 @@ class PDFHandler(object):
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(fpath) layout, dim = get_page_layout(fpath)
# fix rotated PDF # fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh") chars = get_text_objects(layout, ltype="char")
lttextlv = get_text_objects(layout, ltype="lv") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
ltchar = get_text_objects(layout, ltype="char") vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(lttextlh, lttextlv, ltchar) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '': if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)

View File

@ -13,7 +13,8 @@ class BaseParser(object):
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout( self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs) filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.images = get_text_objects(self.layout, ltype='image')
self.vertical_text = get_text_objects(self.layout, ltype="lv") self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)

View File

@ -356,7 +356,11 @@ class Lattice(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('The page is image-based, Camelot only works with'
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -395,7 +395,11 @@ class Stream(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('The page is image-based, Camelot only works with'
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical, LTImage)
PY3 = sys.version_info[0] >= 3 PY3 = sys.version_info[0] >= 3
@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
def get_rotation(lttextlh, lttextlv, ltchar): def get_rotation(chars, horizontal_text, vertical_text):
"""Detects if text in table is rotated or not using the current """Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation. transformation matrix (CTM) and returns its orientation.
Parameters Parameters
---------- ----------
lttextlh : list horizontal_text : list
List of PDFMiner LTTextLineHorizontal objects. List of PDFMiner LTTextLineHorizontal objects.
lttextlv : list vertical_text : list
List of PDFMiner LTTextLineVertical objects. List of PDFMiner LTTextLineVertical objects.
ltchar : list chars : list
List of PDFMiner LTChar objects. List of PDFMiner LTChar objects.
@ -292,8 +292,8 @@ def get_rotation(lttextlh, lttextlv, ltchar):
""" """
rotation = '' rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()]) hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()]) vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects. List of PDFMiner text objects.
""" """
if ltype == "char": if ltype == 'char':
LTObject = LTChar LTObject = LTChar
elif ltype == "lh": elif ltype == 'image':
LTObject = LTImage
elif ltype == 'horizontal_text':
LTObject = LTTextLineHorizontal LTObject = LTTextLineHorizontal
elif ltype == "lv": elif ltype == 'vertical_text':
LTObject = LTTextLineVertical LTObject = LTTextLineVertical
if t is None: if t is None:
t = [] t = []