Merge pull request #240 from socialcopsdev/raise-image-warning
[MRG] Add warning if PDF page is image-based
pull/2/head
commit
99eee608d7
|
|
@ -107,10 +107,10 @@ class PDFHandler(object):
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
layout, dim = get_page_layout(fpath)
|
layout, dim = get_page_layout(fpath)
|
||||||
# fix rotated PDF
|
# fix rotated PDF
|
||||||
lttextlh = get_text_objects(layout, ltype="lh")
|
chars = get_text_objects(layout, ltype="char")
|
||||||
lttextlv = get_text_objects(layout, ltype="lv")
|
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||||
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||||
if rotation != '':
|
if rotation != '':
|
||||||
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,8 @@ class BaseParser(object):
|
||||||
self.layout_kwargs = layout_kwargs
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout, self.dimensions = get_page_layout(
|
self.layout, self.dimensions = get_page_layout(
|
||||||
filename, **layout_kwargs)
|
filename, **layout_kwargs)
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.images = get_text_objects(self.layout, ltype='image')
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
|
||||||
|
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
|
||||||
|
|
@ -356,7 +356,11 @@ class Lattice(BaseParser):
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
if self.images:
|
||||||
|
warnings.warn('{} is image-based, camelot only works on'
|
||||||
|
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||||
|
else:
|
||||||
|
warnings.warn('No tables found on {}'.format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -395,7 +395,11 @@ class Stream(BaseParser):
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
if self.images:
|
||||||
|
warnings.warn('{} is image-based, camelot only works on'
|
||||||
|
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||||
|
else:
|
||||||
|
warnings.warn('No tables found on {}'.format(
|
||||||
os.path.basename(self.rootname)))
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
LTTextLineVertical)
|
LTTextLineVertical, LTImage)
|
||||||
|
|
||||||
|
|
||||||
PY3 = sys.version_info[0] >= 3
|
PY3 = sys.version_info[0] >= 3
|
||||||
|
|
@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
|
||||||
return tables_new, v_segments_new, h_segments_new
|
return tables_new, v_segments_new, h_segments_new
|
||||||
|
|
||||||
|
|
||||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
def get_rotation(chars, horizontal_text, vertical_text):
|
||||||
"""Detects if text in table is rotated or not using the current
|
"""Detects if text in table is rotated or not using the current
|
||||||
transformation matrix (CTM) and returns its orientation.
|
transformation matrix (CTM) and returns its orientation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
lttextlh : list
|
horizontal_text : list
|
||||||
List of PDFMiner LTTextLineHorizontal objects.
|
List of PDFMiner LTTextLineHorizontal objects.
|
||||||
lttextlv : list
|
vertical_text : list
|
||||||
List of PDFMiner LTTextLineVertical objects.
|
List of PDFMiner LTTextLineVertical objects.
|
||||||
ltchar : list
|
ltchar : list
|
||||||
List of PDFMiner LTChar objects.
|
List of PDFMiner LTChar objects.
|
||||||
|
|
@ -292,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
rotation = ''
|
rotation = ''
|
||||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
||||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
||||||
if hlen < vlen:
|
if hlen < vlen:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
||||||
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
||||||
return rotation
|
return rotation
|
||||||
|
|
||||||
|
|
@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
List of PDFMiner text objects.
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if ltype == "char":
|
if ltype == 'char':
|
||||||
LTObject = LTChar
|
LTObject = LTChar
|
||||||
elif ltype == "lh":
|
elif ltype == 'image':
|
||||||
|
LTObject = LTImage
|
||||||
|
elif ltype == 'horizontal_text':
|
||||||
LTObject = LTTextLineHorizontal
|
LTObject = LTTextLineHorizontal
|
||||||
elif ltype == "lv":
|
elif ltype == 'vertical_text':
|
||||||
LTObject = LTTextLineVertical
|
LTObject = LTTextLineVertical
|
||||||
if t is None:
|
if t is None:
|
||||||
t = []
|
t = []
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -41,6 +41,15 @@ def test_stream_equal_length():
|
||||||
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_warning():
|
||||||
|
filename = os.path.join(testdir, 'image.pdf')
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter('error')
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found():
|
def test_no_tables_found():
|
||||||
filename = os.path.join(testdir, 'blank.pdf')
|
filename = os.path.join(testdir, 'blank.pdf')
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue