Merge pull request #240 from socialcopsdev/raise-image-warning

[MRG] Add warning if PDF page is image-based
pull/2/head
Vinayak Mehta 2019-01-03 16:31:43 +05:30 committed by GitHub
commit 99eee608d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 41 additions and 21 deletions

View File

@ -107,10 +107,10 @@ class PDFHandler(object):
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(fpath) layout, dim = get_page_layout(fpath)
# fix rotated PDF # fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh") chars = get_text_objects(layout, ltype="char")
lttextlv = get_text_objects(layout, ltype="lv") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
ltchar = get_text_objects(layout, ltype="char") vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(lttextlh, lttextlv, ltchar) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '': if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)

View File

@ -13,7 +13,8 @@ class BaseParser(object):
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout( self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs) filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.images = get_text_objects(self.layout, ltype='image')
self.vertical_text = get_text_objects(self.layout, ltype="lv") self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)

View File

@ -356,7 +356,11 @@ class Lattice(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -395,7 +395,11 @@ class Stream(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical, LTImage)
PY3 = sys.version_info[0] >= 3 PY3 = sys.version_info[0] >= 3
@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
def get_rotation(lttextlh, lttextlv, ltchar): def get_rotation(chars, horizontal_text, vertical_text):
"""Detects if text in table is rotated or not using the current """Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation. transformation matrix (CTM) and returns its orientation.
Parameters Parameters
---------- ----------
lttextlh : list horizontal_text : list
List of PDFMiner LTTextLineHorizontal objects. List of PDFMiner LTTextLineHorizontal objects.
lttextlv : list vertical_text : list
List of PDFMiner LTTextLineVertical objects. List of PDFMiner LTTextLineVertical objects.
ltchar : list chars : list
List of PDFMiner LTChar objects. List of PDFMiner LTChar objects.
@ -292,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
""" """
rotation = '' rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()]) hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()]) vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
return rotation return rotation
@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects. List of PDFMiner text objects.
""" """
if ltype == "char": if ltype == 'char':
LTObject = LTChar LTObject = LTChar
elif ltype == "lh": elif ltype == 'image':
LTObject = LTImage
elif ltype == 'horizontal_text':
LTObject = LTTextLineHorizontal LTObject = LTTextLineHorizontal
elif ltype == "lv": elif ltype == 'vertical_text':
LTObject = LTTextLineVertical LTObject = LTTextLineVertical
if t is None: if t is None:
t = [] t = []

Binary file not shown.

View File

@ -41,6 +41,15 @@ def test_stream_equal_length():
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
def test_image_warning():
filename = os.path.join(testdir, 'image.pdf')
with warnings.catch_warnings():
warnings.simplefilter('error')
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename)
assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
def test_no_tables_found(): def test_no_tables_found():
filename = os.path.join(testdir, 'blank.pdf') filename = os.path.join(testdir, 'blank.pdf')
with warnings.catch_warnings(): with warnings.catch_warnings():