Merge pull request #240 from socialcopsdev/raise-image-warning

[MRG] Add warning if PDF page is image-based
pull/2/head
Vinayak Mehta 2019-01-03 16:31:43 +05:30 committed by GitHub
commit 99eee608d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 41 additions and 21 deletions

View File

@ -107,10 +107,10 @@ class PDFHandler(object):
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
rotation = get_rotation(lttextlh, lttextlv, ltchar)
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new)

View File

@ -13,7 +13,8 @@ class BaseParser(object):
self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
self.vertical_text = get_text_objects(self.layout, ltype="lv")
self.images = get_text_objects(self.layout, ltype='image')
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)

View File

@ -356,7 +356,11 @@ class Lattice(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text:
warnings.warn("No tables found on {}".format(
if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
return []

View File

@ -395,7 +395,11 @@ class Stream(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text:
warnings.warn("No tables found on {}".format(
if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
return []

View File

@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
LTTextLineVertical, LTImage)
PY3 = sys.version_info[0] >= 3
@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new
def get_rotation(lttextlh, lttextlv, ltchar):
def get_rotation(chars, horizontal_text, vertical_text):
"""Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation.
Parameters
----------
lttextlh : list
horizontal_text : list
List of PDFMiner LTTextLineHorizontal objects.
lttextlv : list
vertical_text : list
List of PDFMiner LTTextLineVertical objects.
chars : list
List of PDFMiner LTChar objects.
@ -292,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
"""
rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()])
hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
return rotation
@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects.
"""
if ltype == "char":
if ltype == 'char':
LTObject = LTChar
elif ltype == "lh":
elif ltype == 'image':
LTObject = LTImage
elif ltype == 'horizontal_text':
LTObject = LTTextLineHorizontal
elif ltype == "lv":
elif ltype == 'vertical_text':
LTObject = LTTextLineVertical
if t is None:
t = []

Binary file not shown.

View File

@ -41,6 +41,15 @@ def test_stream_equal_length():
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
def test_image_warning():
    """Check that an image-based page produces the image-specific warning.

    Warnings are promoted to errors inside the ``catch_warnings`` block so
    that the ``UserWarning`` issued while reading the PDF surfaces as an
    exception that ``pytest.raises`` can capture, letting the test compare
    the exact warning message.
    """
    filename = os.path.join(testdir, 'image.pdf')
    with warnings.catch_warnings():
        # Turn warnings into exceptions for the duration of this block only.
        warnings.simplefilter('error')
        with pytest.raises(UserWarning) as e:
            # The return value is irrelevant here; only the warning matters.
            camelot.read_pdf(filename)
        # Checked after the ``raises`` block exits, once ``e.value`` is set.
        assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
def test_no_tables_found():
filename = os.path.join(testdir, 'blank.pdf')
with warnings.catch_warnings():