Merge pull request #240 from socialcopsdev/raise-image-warning

[MRG] Add warning if PDF page is image-based
2019-01-03 16:31:43 +05:30
parent 7a0acd7929 605ffdd444
commit 99eee608d7
7 changed files with 41 additions and 21 deletions
@@ -107,10 +107,10 @@ class PDFHandler(object):
                outfile.write(f)
            layout, dim = get_page_layout(fpath)
            # fix rotated PDF
-            lttextlh = get_text_objects(layout, ltype="lh")
-            lttextlv = get_text_objects(layout, ltype="lv")
-            ltchar = get_text_objects(layout, ltype="char")
-            rotation = get_rotation(lttextlh, lttextlv, ltchar)
+            chars = get_text_objects(layout, ltype="char")
+            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+            vertical_text = get_text_objects(layout, ltype="vertical_text")
+            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != '':
                fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
                os.rename(fpath, fpath_new)
@@ -13,7 +13,8 @@ class BaseParser(object):
        self.layout_kwargs = layout_kwargs
        self.layout, self.dimensions = get_page_layout(
            filename, **layout_kwargs)
-        self.horizontal_text = get_text_objects(self.layout, ltype="lh")
-        self.vertical_text = get_text_objects(self.layout, ltype="lv")
+        self.images = get_text_objects(self.layout, ltype='image')
+        self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
+        self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
@@ -356,7 +356,11 @@ class Lattice(BaseParser):
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))

        if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
+            if self.images:
+                warnings.warn('{} is image-based, camelot only works on'
+                              ' text-based pages.'.format(os.path.basename(self.rootname)))
+            else:
+                warnings.warn('No tables found on {}'.format(
                    os.path.basename(self.rootname)))
            return []

@@ -395,7 +395,11 @@ class Stream(BaseParser):
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))

        if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
+            if self.images:
+                warnings.warn('{} is image-based, camelot only works on'
+                              ' text-based pages.'.format(os.path.basename(self.rootname)))
+            else:
+                warnings.warn('No tables found on {}'.format(
                    os.path.basename(self.rootname)))
            return []

@@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
-                             LTTextLineVertical)
+                             LTTextLineVertical, LTImage)


 PY3 = sys.version_info[0] >= 3
@@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
    return tables_new, v_segments_new, h_segments_new


-def get_rotation(lttextlh, lttextlv, ltchar):
+def get_rotation(chars, horizontal_text, vertical_text):
    """Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
-    lttextlh : list
+    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
-    lttextlv : list
+    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.
-    ltchar : list
+    chars : list
        List of PDFMiner LTChar objects.
@@ -292,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):

    """
    rotation = ''
-    hlen = len([t for t in lttextlh if t.get_text().strip()])
-    vlen = len([t for t in lttextlv if t.get_text().strip()])
+    hlen = len([t for t in horizontal_text if t.get_text().strip()])
+    vlen = len([t for t in vertical_text if t.get_text().strip()])
    if hlen < vlen:
-        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
-        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
+        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
+        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
        rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
    return rotation

@@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
        List of PDFMiner text objects.

    """
-    if ltype == "char":
+    if ltype == 'char':
        LTObject = LTChar
-    elif ltype == "lh":
+    elif ltype == 'image':
+        LTObject = LTImage
+    elif ltype == 'horizontal_text':
        LTObject = LTTextLineHorizontal
-    elif ltype == "lv":
+    elif ltype == 'vertical_text':
        LTObject = LTTextLineVertical
    if t is None:
        t = []
@@ -41,6 +41,15 @@ def test_stream_equal_length():
            table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])


+def test_image_warning():
+    filename = os.path.join(testdir, 'image.pdf')
+    with warnings.catch_warnings():
+        warnings.simplefilter('error')
+        with pytest.raises(UserWarning) as e:
+            tables = camelot.read_pdf(filename)
+        assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
+
+
 def test_no_tables_found():
    filename = os.path.join(testdir, 'blank.pdf')
    with warnings.catch_warnings():