Fix #239
parent
7a0acd7929
commit
f605bd8f94
|
|
@ -107,10 +107,10 @@ class PDFHandler(object):
|
|||
outfile.write(f)
|
||||
layout, dim = get_page_layout(fpath)
|
||||
# fix rotated PDF
|
||||
lttextlh = get_text_objects(layout, ltype="lh")
|
||||
lttextlv = get_text_objects(layout, ltype="lv")
|
||||
ltchar = get_text_objects(layout, ltype="char")
|
||||
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
||||
chars = get_text_objects(layout, ltype="char")
|
||||
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||
if rotation != '':
|
||||
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||
os.rename(fpath, fpath_new)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ class BaseParser(object):
|
|||
self.layout_kwargs = layout_kwargs
|
||||
self.layout, self.dimensions = get_page_layout(
|
||||
filename, **layout_kwargs)
|
||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
||||
self.images = get_text_objects(self.layout, ltype='image')
|
||||
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
|
||||
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
|
||||
self.pdf_width, self.pdf_height = self.dimensions
|
||||
self.rootname, __ = os.path.splitext(self.filename)
|
||||
|
|
|
|||
|
|
@ -356,7 +356,11 @@ class Lattice(BaseParser):
|
|||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||
|
||||
if not self.horizontal_text:
|
||||
warnings.warn("No tables found on {}".format(
|
||||
if self.images:
|
||||
warnings.warn('The page is image-based, Camelot only works with'
|
||||
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
|
||||
else:
|
||||
warnings.warn('No tables found on {}'.format(
|
||||
os.path.basename(self.rootname)))
|
||||
return []
|
||||
|
||||
|
|
|
|||
|
|
@ -395,7 +395,11 @@ class Stream(BaseParser):
|
|||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||
|
||||
if not self.horizontal_text:
|
||||
warnings.warn("No tables found on {}".format(
|
||||
if self.images:
|
||||
warnings.warn('The page is image-based, Camelot only works with'
|
||||
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
|
||||
else:
|
||||
warnings.warn('No tables found on {}'.format(
|
||||
os.path.basename(self.rootname)))
|
||||
return []
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
|
|||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||
LTTextLineVertical)
|
||||
LTTextLineVertical, LTImage)
|
||||
|
||||
|
||||
PY3 = sys.version_info[0] >= 3
|
||||
|
|
@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
|
|||
return tables_new, v_segments_new, h_segments_new
|
||||
|
||||
|
||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
||||
def get_rotation(chars, horizontal_text, vertical_text):
|
||||
"""Detects if text in table is rotated or not using the current
|
||||
transformation matrix (CTM) and returns its orientation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lttextlh : list
|
||||
horizontal_text : list
|
||||
List of PDFMiner LTTextLineHorizontal objects.
|
||||
lttextlv : list
|
||||
vertical_text : list
|
||||
List of PDFMiner LTTextLineVertical objects.
|
||||
ltchar : list
|
||||
List of PDFMiner LTChar objects.
|
||||
|
|
@ -292,8 +292,8 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
|||
|
||||
"""
|
||||
rotation = ''
|
||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
||||
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
||||
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
||||
if hlen < vlen:
|
||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
||||
|
|
@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
|||
List of PDFMiner text objects.
|
||||
|
||||
"""
|
||||
if ltype == "char":
|
||||
if ltype == 'char':
|
||||
LTObject = LTChar
|
||||
elif ltype == "lh":
|
||||
elif ltype == 'image':
|
||||
LTObject = LTImage
|
||||
elif ltype == 'horizontal_text':
|
||||
LTObject = LTTextLineHorizontal
|
||||
elif ltype == "lv":
|
||||
elif ltype == 'vertical_text':
|
||||
LTObject = LTTextLineVertical
|
||||
if t is None:
|
||||
t = []
|
||||
|
|
|
|||
Loading…
Reference in New Issue