From f605bd8f94a08b18b860fab2732e93a9beaadc2a Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 3 Jan 2019 14:55:47 +0530 Subject: [PATCH] Fix #239 --- camelot/handlers.py | 8 ++++---- camelot/parsers/base.py | 5 +++-- camelot/parsers/lattice.py | 8 ++++++-- camelot/parsers/stream.py | 8 ++++++-- camelot/utils.py | 20 +++++++++++--------- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/camelot/handlers.py b/camelot/handlers.py index 8f0cbcb..4955f03 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -107,10 +107,10 @@ class PDFHandler(object): outfile.write(f) layout, dim = get_page_layout(fpath) # fix rotated PDF - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - rotation = get_rotation(lttextlh, lttextlv, ltchar) + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != '': fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) os.rename(fpath, fpath_new) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index a3280de..a20cd5e 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -13,7 +13,8 @@ class BaseParser(object): self.layout_kwargs = layout_kwargs self.layout, self.dimensions = get_page_layout( filename, **layout_kwargs) - self.horizontal_text = get_text_objects(self.layout, ltype="lh") - self.vertical_text = get_text_objects(self.layout, ltype="lv") + self.images = get_text_objects(self.layout, ltype='image') + self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text') + self.vertical_text = get_text_objects(self.layout, ltype='vertical_text') self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 50d8e1f..ca3731d 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -356,8 +356,12 @@ class Lattice(BaseParser): logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - warnings.warn("No tables found on {}".format( - os.path.basename(self.rootname))) + if self.images: + warnings.warn('The page is image-based, Camelot only works with' + ' text-based PDF pages.'.format(os.path.basename(self.rootname))) + else: + warnings.warn('No tables found on {}'.format( + os.path.basename(self.rootname))) return [] self._generate_image() diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 049bc9f..83e2598 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -395,8 +395,12 @@ class Stream(BaseParser): logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - warnings.warn("No tables found on {}".format( - os.path.basename(self.rootname))) + if self.images: + warnings.warn('The page is image-based, Camelot only works with' + ' text-based PDF pages.'.format(os.path.basename(self.rootname))) + else: + warnings.warn('No tables found on {}'.format( + os.path.basename(self.rootname))) return [] self._generate_table_bbox() diff --git a/camelot/utils.py b/camelot/utils.py index 8a95767..d35cab7 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -20,7 +20,7 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.converter import PDFPageAggregator from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, - LTTextLineVertical) + LTTextLineVertical, LTImage) PY3 = sys.version_info[0] >= 3 @@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors): return tables_new, v_segments_new, h_segments_new -def get_rotation(lttextlh, lttextlv, ltchar): +def get_rotation(chars, horizontal_text, vertical_text): """Detects if text in table is rotated or not using the current transformation matrix (CTM) and returns its orientation. Parameters ---------- - lttextlh : list + horizontal_text : list List of PDFMiner LTTextLineHorizontal objects. - lttextlv : list + vertical_text : list List of PDFMiner LTTextLineVertical objects. ltchar : list List of PDFMiner LTChar objects. @@ -292,8 +292,8 @@ def get_rotation(lttextlh, lttextlv, ltchar): """ rotation = '' - hlen = len([t for t in lttextlh if t.get_text().strip()]) - vlen = len([t for t in lttextlv if t.get_text().strip()]) + hlen = len([t for t in horizontal_text if t.get_text().strip()]) + vlen = len([t for t in vertical_text if t.get_text().strip()]) if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) @@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None): List of PDFMiner text objects. """ - if ltype == "char": + if ltype == 'char': LTObject = LTChar - elif ltype == "lh": + elif ltype == 'image': + LTObject = LTImage + elif ltype == 'horizontal_text': LTObject = LTTextLineHorizontal - elif ltype == "lv": + elif ltype == 'vertical_text': LTObject = LTTextLineVertical if t is None: t = []