diff --git a/HISTORY.md b/HISTORY.md index 7bfa3f0..a1e21e9 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,18 @@ master **Improvements** +* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta. + * You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables. + * Kwarg `line_size_scaling` is now called `line_scale`. +* [#212](https://github.com/socialcopsdev/camelot/issues/212) Add support to export to a SQLite database. [#244](https://github.com/socialcopsdev/camelot/pull/244) by Vinayak Mehta. +* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta. + +0.6.0 (2018-12-24) +------------------ + +**Improvements** + +* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from a URL. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta. * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta. * `strip_text`: To define characters that should be stripped from each string. * `edge_tol`: Tolerance parameter for extending textedges vertically. 
diff --git a/README.md b/README.md index 1d89c30..3f6013f 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ >>> tables = camelot.read_pdf('foo.pdf') >>> tables <TableList n=1> ->>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html +>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html, sqlite >>> tables[0] <Table shape=(7, 7)> >>> tables[0].parsing_report @@ -31,7 +31,7 @@ 'order': 1, 'page': 1 } ->>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html +>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_sqlite >>> tables[0].df # get a pandas DataFrame! @@ -53,7 +53,7 @@ There's a [command-line interface](https://camelot-py.readthedocs.io/en/master/u - **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.) - *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table. - Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). -- **Export** to multiple formats, including JSON, Excel and HTML. +- **Export** to multiple formats, including JSON, Excel, HTML and SQLite. See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). 
diff --git a/camelot/__version__.py b/camelot/__version__.py index 3f619b1..b5e80e8 100644 --- a/camelot/__version__.py +++ b/camelot/__version__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 0) +VERSION = (0, 6, 0) PRERELEASE = None # alpha, beta or rc REVISION = None diff --git a/camelot/cli.py b/camelot/cli.py index a1a571e..b661555 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -32,11 +32,11 @@ pass_config = click.make_pass_decorator(Config) @click.version_option(version=__version__) @click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.') @click.option('-p', '--pages', default='1', help='Comma-separated page numbers.' - ' Example: 1,3,4 or 1,4-end.') + ' Example: 1,3,4 or 1,4-end or all.') @click.option('-pw', '--password', help='Password for decryption.') @click.option('-o', '--output', help='Output file path.') @click.option('-f', '--format', - type=click.Choice(['csv', 'json', 'excel', 'html']), + type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']), help='Output file format.') @click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.') @click.option('-split', '--split_text', is_flag=True, @@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs): @cli.command('lattice') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-back', '--process_background', is_flag=True, help='Process background lines.') -@click.option('-scale', '--line_size_scaling', default=15, +@click.option('-scale', '--line_scale', default=15, help='Line size scaling factor. 
The larger the value,' ' the smaller the detected lines.') @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas copy_text = list(kwargs['copy_text']) @@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs): @cli.command('stream') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @@ -160,6 +168,8 @@ def stream(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas columns = list(kwargs['columns']) diff --git a/camelot/core.py b/camelot/core.py index 4e5869a..e82a11f 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import sqlite3 import zipfile import tempfile from itertools import chain @@ -592,6 +593,28 @@ class Table(object): with open(path, 'w') as f: f.write(html_string) + def to_sqlite(self, path, **kwargs): + """Writes Table to sqlite database. + + For kwargs, check :meth:`pandas.DataFrame.to_sql`. + + Parameters + ---------- + path : str + Output filepath. 
+ + """ + kw = { + 'if_exists': 'replace', + 'index': False + } + kw.update(kwargs) + conn = sqlite3.connect(path) + table_name = 'page-{}-table-{}'.format(self.page, self.order) + self.df.to_sql(table_name, conn, **kw) + conn.commit() + conn.close() + class TableList(object): """Defines a list of camelot.core.Table objects. Each table can @@ -656,7 +679,7 @@ class TableList(object): path : str Output filepath. f : str - File format. Can be csv, json, excel and html. + File format. Can be csv, json, excel, html and sqlite. compress : bool Whether or not to add files to a ZIP archive. @@ -689,3 +712,11 @@ class TableList(object): zipname = os.path.join(os.path.dirname(path), root) + '.zip' with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: z.write(filepath, os.path.basename(filepath)) + elif f == 'sqlite': + filepath = os.path.join(dirname, basename) + for table in self._tables: + table.to_sqlite(filepath) + if compress: + zipname = os.path.join(os.path.dirname(path), root) + '.zip' + with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: + z.write(filepath, os.path.basename(filepath)) diff --git a/camelot/handlers.py b/camelot/handlers.py index 35708ee..4955f03 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter from .core import TableList from .parsers import Stream, Lattice from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, - get_rotation) + get_rotation, is_url, download_url) class PDFHandler(object): @@ -18,20 +18,22 @@ class PDFHandler(object): Parameters ---------- - filename : str - Path to PDF file. + filepath : str + Filepath or URL of the PDF file. pages : str, optional (default: '1') Comma-separated page numbers. - Example: '1,3,4' or '1,4-end'. + Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. 
""" - def __init__(self, filename, pages='1', password=None): - self.filename = filename - if not filename.lower().endswith('.pdf'): + def __init__(self, filepath, pages='1', password=None): + if is_url(filepath): + filepath = download_url(filepath) + self.filepath = filepath + if not filepath.lower().endswith('.pdf'): raise NotImplementedError("File format not supported") - self.pages = self._get_pages(self.filename, pages) + self.pages = self._get_pages(self.filepath, pages) if password is None: self.password = '' else: @@ -39,16 +41,16 @@ class PDFHandler(object): if sys.version_info[0] < 3: self.password = self.password.encode('ascii') - def _get_pages(self, filename, pages): + def _get_pages(self, filepath, pages): """Converts pages string to list of ints. Parameters ---------- - filename : str - Path to PDF file. + filepath : str + Filepath or URL of the PDF file. pages : str, optional (default: '1') Comma-separated page numbers. - Example: 1,3,4 or 1,4-end. + Example: '1,3,4' or '1,4-end' or 'all'. Returns ------- @@ -60,7 +62,7 @@ class PDFHandler(object): if pages == '1': page_numbers.append({'start': 1, 'end': 1}) else: - infile = PdfFileReader(open(filename, 'rb'), strict=False) + infile = PdfFileReader(open(filepath, 'rb'), strict=False) if infile.isEncrypted: infile.decrypt(self.password) if pages == 'all': @@ -79,20 +81,20 @@ class PDFHandler(object): P.extend(range(p['start'], p['end'] + 1)) return sorted(set(P)) - def _save_page(self, filename, page, temp): + def _save_page(self, filepath, page, temp): """Saves specified page from PDF into a temporary directory. Parameters ---------- - filename : str - Path to PDF file. + filepath : str + Filepath or URL of the PDF file. page : int Page number. temp : str Tmp directory. 
""" - with open(filename, 'rb') as fileobj: + with open(filepath, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) @@ -105,10 +107,10 @@ class PDFHandler(object): outfile.write(f) layout, dim = get_page_layout(fpath) # fix rotated PDF - lttextlh = get_text_objects(layout, ltype="lh") - lttextlv = get_text_objects(layout, ltype="lv") - ltchar = get_text_objects(layout, ltype="char") - rotation = get_rotation(lttextlh, lttextlv, ltchar) + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != '': fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) os.rename(fpath, fpath_new) @@ -150,7 +152,7 @@ class PDFHandler(object): tables = [] with TemporaryDirectory() as tempdir: for p in self.pages: - self._save_page(self.filename, p, tempdir) + self._save_page(self.filepath, p, tempdir) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index eb23101..3051852 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): +def find_lines(threshold, regions=None, direction='horizontal', + line_scale=15, iterations=0): """Finds horizontal and vertical lines by applying morphological transformations on an image. @@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio ---------- threshold : object numpy.ndarray representing the thresholded image. 
+ regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in image coordinate space. direction : string, optional (default: 'horizontal') Specifies whether to find vertical or horizontal lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Factor by which the page dimensions will be divided to get smallest length of lines that should be detected. @@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio lines = [] if direction == 'vertical': - size = threshold.shape[0] // line_size_scaling + size = threshold.shape[0] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) elif direction == 'horizontal': - size = threshold.shape[1] // line_size_scaling + size = threshold.shape[1] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'") + if regions is not None: + region_mask = np.zeros(threshold.shape) + for region in regions: + x, y, w, h = region + region_mask[y : y + h, x : x + w] = 1 + threshold = np.multiply(threshold, region_mask) + threshold = cv2.erode(threshold, el) threshold = cv2.dilate(threshold, el) dmask = cv2.dilate(threshold, el, iterations=iterations) try: _, contours, _ = cv2.findContours( - threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility contours, _ = cv2.findContours( - threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) for c in contours: x, y, w, h = cv2.boundingRect(c) @@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio return dmask, lines -def 
find_table_contours(vertical, horizontal): +def find_contours(vertical, horizontal): """Finds table boundaries using OpenCV's findContours. Parameters @@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal): try: __, contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # sort in reverse based on contour area and use first 10 contours contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] cont = [] @@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal): return cont -def find_table_joints(contours, vertical, horizontal): +def find_joints(contours, vertical, horizontal): """Finds joints/intersections present inside each table boundary. Parameters @@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal): and (x2, y2) -> rt in image coordinate space. 
""" - joints = np.bitwise_and(vertical, horizontal) + joints = np.multiply(vertical, horizontal) tables = {} for c in contours: x, y, w, h = c roi = joints[y : y + h, x : x + w] try: __, jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than 4 joints continue joint_coords = [] diff --git a/camelot/io.py b/camelot/io.py index 96ffa27..5162dd2 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + import warnings from .handlers import PDFHandler @@ -15,10 +16,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Parameters ---------- filepath : str - Path to PDF file. + Filepath or URL of the PDF file. pages : str, optional (default: '1') Comma-separated page numbers. - Example: '1,3,4' or '1,4-end'. + Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. flavor : str (default: 'lattice') @@ -51,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', to generate columns. process_background* : bool, optional (default: False) Process background lines. - line_size_scaling* : int, optional (default: 15) + line_scale* : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text being detected as lines. 
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index a3280de..a20cd5e 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -13,7 +13,8 @@ class BaseParser(object): self.layout_kwargs = layout_kwargs self.layout, self.dimensions = get_page_layout( filename, **layout_kwargs) - self.horizontal_text = get_text_objects(self.layout, ltype="lh") - self.vertical_text = get_text_objects(self.layout, ltype="lv") + self.images = get_text_objects(self.layout, ltype='image') + self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text') + self.vertical_text = get_text_objects(self.layout, ltype='vertical_text') self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 1924d84..c01727c 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -19,7 +19,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_lines, get_table_index, compute_accuracy, compute_whitespace) from ..image_processing import (adaptive_threshold, find_lines, - find_table_contours, find_table_joints) + find_contours, find_joints) logger = logging.getLogger('camelot') @@ -31,13 +31,17 @@ class Lattice(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. process_background : bool, optional (default: False) Process background lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. 
Making it very large will lead to text being detected as lines. @@ -80,14 +84,15 @@ class Lattice(BaseParser): Resolution used for PDF to PNG conversion. """ - def __init__(self, table_areas=None, process_background=False, - line_size_scaling=15, copy_text=None, shift_text=['l', 't'], + def __init__(self, table_regions=None, table_areas=None, process_background=False, + line_scale=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, strip_text='', line_tol=2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, resolution=300, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background - self.line_size_scaling = line_size_scaling + self.line_scale = line_scale self.copy_text = copy_text self.shift_text = shift_text self.split_text = split_text @@ -189,9 +194,22 @@ class Lattice(BaseParser): null.close() def _generate_table_bbox(self): + def scale_areas(areas): + scaled_areas = [] + for area in areas: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) + scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + return scaled_areas + self.image, self.threshold = adaptive_threshold( self.imagename, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant) + image_width = self.image.shape[1] image_height = self.image.shape[0] image_width_scaler = image_width / float(self.pdf_width) @@ -201,27 +219,30 @@ class Lattice(BaseParser): image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) - vertical_mask, vertical_segments = find_lines( - self.threshold, direction='vertical', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) - horizontal_mask, horizontal_segments = find_lines( - self.threshold, 
direction='horizontal', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) + if self.table_areas is None: + regions = None + if self.table_regions is not None: + regions = scale_areas(self.table_regions) - if self.table_areas is not None: - areas = [] - for area in self.table_areas: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) - areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask) + vertical_mask, vertical_segments = find_lines( + self.threshold, regions=regions, direction='vertical', + line_scale=self.line_scale, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, regions=regions, direction='horizontal', + line_scale=self.line_scale, iterations=self.iterations) + + contours = find_contours(vertical_mask, horizontal_mask) + table_bbox = find_joints(contours, vertical_mask, horizontal_mask) else: - contours = find_table_contours(vertical_mask, horizontal_mask) - table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) + vertical_mask, vertical_segments = find_lines( + self.threshold, direction='vertical', line_scale=self.line_scale, + iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, direction='horizontal', line_scale=self.line_scale, + iterations=self.iterations) + + areas = scale_areas(self.table_areas) + table_bbox = find_joints(areas, vertical_mask, horizontal_mask) self.table_bbox_unscaled = copy.deepcopy(table_bbox) @@ -318,8 +339,12 @@ class Lattice(BaseParser): logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - warnings.warn("No tables found on {}".format( - os.path.basename(self.rootname))) + if self.images: + warnings.warn('{} is image-based, camelot only works on' + ' text-based 
pages.'.format(os.path.basename(self.rootname))) + else: + warnings.warn('No tables found on {}'.format( + os.path.basename(self.rootname))) return [] self._generate_image() diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 049bc9f..1efe144 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -26,6 +26,10 @@ class Stream(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -51,9 +55,10 @@ class Stream(BaseParser): to generate columns. """ - def __init__(self, table_areas=None, columns=None, split_text=False, + def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False, flag_size=False, strip_text='', edge_tol=50, row_tol=2, column_tol=0, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.columns = columns self._validate_columns() @@ -275,7 +280,18 @@ class Stream(BaseParser): def _generate_table_bbox(self): self.textedges = [] - if self.table_areas is not None: + if self.table_areas is None: + hor_text = self.horizontal_text + if self.table_regions is not None: + # filter horizontal text + hor_text = [] + for region in self.table_regions: + x1, y1, x2, y2 = region + region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text) + hor_text.extend(region_text) + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(hor_text) + else: table_bbox = {} for area in self.table_areas: x1, y1, x2, y2 = area.split(",") @@ -284,9 +300,6 @@ class Stream(BaseParser): x2 = float(x2) y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None - else: - # find tables based on nurminen's detection 
algorithm - table_bbox = self._nurminen_table_detection(self.horizontal_text) self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk): @@ -395,8 +408,12 @@ class Stream(BaseParser): logger.info('Processing {}'.format(os.path.basename(self.rootname))) if not self.horizontal_text: - warnings.warn("No tables found on {}".format( - os.path.basename(self.rootname))) + if self.images: + warnings.warn('{} is image-based, camelot only works on' + ' text-based pages.'.format(os.path.basename(self.rootname))) + else: + warnings.warn('No tables found on {}'.format( + os.path.basename(self.rootname))) return [] self._generate_table_bbox() diff --git a/camelot/utils.py b/camelot/utils.py index 88564f7..7b22307 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,12 +1,17 @@ +# -*- coding: utf-8 -*- from __future__ import division + +import os +import sys +import random import shutil +import string import tempfile import warnings from itertools import groupby from operator import itemgetter import numpy as np - from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -15,7 +20,78 @@ from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.converter import PDFPageAggregator from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, - LTTextLineVertical) + LTTextLineVertical, LTImage) + + +PY3 = sys.version_info[0] >= 3 +if PY3: + from urllib.request import urlopen + from urllib.parse import urlparse as parse_url + from urllib.parse import uses_relative, uses_netloc, uses_params +else: + from urllib2 import urlopen + from urlparse import urlparse as parse_url + from urlparse import uses_relative, uses_netloc, uses_params + + +_VALID_URLS = set(uses_relative + uses_netloc + uses_params) +_VALID_URLS.discard('') + + +# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py +def is_url(url): + 
"""Check to see if a URL has a valid protocol. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + isurl : bool + If url has a valid protocol return True otherwise False. + + """ + try: + return parse_url(url).scheme in _VALID_URLS + except Exception: + return False + + +def random_string(length): + ret = '' + while length: + ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase) + length -= 1 + return ret + + +def download_url(url): + """Download file from specified URL. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + filepath : str or unicode + Temporary filepath. + + """ + filename = '{}.pdf'.format(random_string(6)) + with tempfile.NamedTemporaryFile('wb', delete=False) as f: + obj = urlopen(url) + if PY3: + content_type = obj.info().get_content_type() + else: + content_type = obj.info().getheader('Content-Type') + if content_type != 'application/pdf': + raise NotImplementedError("File format not supported") + f.write(obj.read()) + filepath = os.path.join(os.path.dirname(f.name), filename) + shutil.move(f.name, filepath) + return filepath stream_kwargs = [ @@ -25,7 +101,7 @@ stream_kwargs = [ ] lattice_kwargs = [ 'process_background', - 'line_size_scaling', + 'line_scale', 'copy_text', 'shift_text', 'line_tol', @@ -194,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors): return tables_new, v_segments_new, h_segments_new -def get_rotation(lttextlh, lttextlv, ltchar): +def get_rotation(chars, horizontal_text, vertical_text): """Detects if text in table is rotated or not using the current transformation matrix (CTM) and returns its orientation. Parameters ---------- - lttextlh : list + horizontal_text : list List of PDFMiner LTTextLineHorizontal objects. - lttextlv : list + vertical_text : list List of PDFMiner LTTextLineVertical objects. ltchar : list List of PDFMiner LTChar objects. 
@@ -216,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar): """ rotation = '' - hlen = len([t for t in lttextlh if t.get_text().strip()]) - vlen = len([t for t in lttextlv if t.get_text().strip()]) + hlen = len([t for t in horizontal_text if t.get_text().strip()]) + vlen = len([t for t in vertical_text if t.get_text().strip()]) if hlen < vlen: - clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) - anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) + clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) + anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' return rotation @@ -263,7 +339,7 @@ def text_in_bbox(bbox, text): ---------- bbox : tuple Tuple (x1, y1, x2, y2) representing a bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate space. text : List of PDFMiner text objects. @@ -637,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None): List of PDFMiner text objects. 
""" - if ltype == "char": + if ltype == 'char': LTObject = LTChar - elif ltype == "lh": + elif ltype == 'image': + LTObject = LTImage + elif ltype == 'horizontal_text': LTObject = LTTextLineHorizontal - elif ltype == "lv": + elif ltype == 'vertical_text': LTObject = LTTextLineVertical if t is None: t = [] diff --git a/docs/_static/csv/table_regions.csv b/docs/_static/csv/table_regions.csv new file mode 100644 index 0000000..caf534e --- /dev/null +++ b/docs/_static/csv/table_regions.csv @@ -0,0 +1,4 @@ +"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione" +"18-75","1,00%" +"76-80","0,50%" +"81 in poi","0,10%" diff --git a/docs/_static/pdf/table_regions.pdf b/docs/_static/pdf/table_regions.pdf new file mode 100644 index 0000000..f6f053b Binary files /dev/null and b/docs/_static/pdf/table_regions.pdf differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index ca40bb8..e7b4ab7 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte Specify table areas ------------------- -In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. +In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_areas`` keyword argument. -.. _for now: https://github.com/socialcopsdev/camelot/issues/102 - :: >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) @@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se .. 
csv-table:: :file: ../_static/csv/table_areas.csv +Specify table regions +--------------------- + +However there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] `__, where the table might not lie at the exact coordinates every time but in an approximate region. + +You can use the ``table_regions`` keyword argument to :meth:`read_pdf() ` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables. + +:: + + >>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270']) + >>> tables[0].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot lattice -R 170,370,560,270 table_regions.pdf + +.. csv-table:: + :file: ../_static/csv/table_regions.csv + Specify column separators ------------------------- @@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo Detect short lines ------------------ -There might be cases while using :ref:`Lattice ` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. +There might be cases while using :ref:`Lattice ` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15. -As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. +As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected. -.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. +.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines. 
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15. @@ -458,11 +477,11 @@ Let's plot the table for this PDF. :alt: A plot of the PDF table with short lines :align: left -Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again. +Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_scale=40``, and plot the table again. :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40) >>> camelot.plot(tables[0], kind='grid') >>> plt.show() @@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['']) >>> tables[0].df .. csv-table:: @@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b']) >>> tables[0].df .. tip:: diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index d9d704a..144a302 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -14,7 +14,7 @@ Begin by importing the Camelot module:: >>> import camelot -Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice ` method here. To do that, we will set the ``mesh`` keyword argument to ``True``. +Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) 
Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice ` method here. .. note:: :ref:`Lattice ` is used by default. You can use :ref:`Stream ` with ``flavor='stream'``. @@ -56,7 +56,7 @@ Woah! The accuracy is top-notch and there is less whitespace, which means the ta .. csv-table:: :file: ../_static/csv/foo.csv -Looks good! You can now export the table as a CSV file using its :meth:`to_csv() ` method. Alternatively you can use :meth:`to_json() `, :meth:`to_excel() ` or :meth:`to_html() ` methods to export the table as JSON, Excel and HTML files respectively. +Looks good! You can now export the table as a CSV file using its :meth:`to_csv() ` method. Alternatively you can use :meth:`to_json() `, :meth:`to_excel() `, :meth:`to_html() ` or :meth:`to_sqlite() ` methods to export the table as JSON, Excel, HTML files or a sqlite database respectively. :: @@ -76,7 +76,7 @@ You can also export all tables at once, using the :class:`tables ` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files. 
diff --git a/tests/data.py b/tests/data.py index c223227..99527d5 100755 --- a/tests/data.py +++ b/tests/data.py @@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [ ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"] ] +data_lattice_table_regions = [ + ['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'], + ['18-75', '1,00%'], + ['76-80', '0,50%'], + ['81 in poi', '0,10%'] +] + data_lattice_table_areas = [ ["", "", "", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""], diff --git a/tests/files/image.pdf b/tests/files/image.pdf new file mode 100644 index 0000000..83f5969 Binary files /dev/null and b/tests/files/image.pdf differ diff --git a/tests/files/table_region.pdf b/tests/files/table_region.pdf new file mode 100644 index 0000000..f6f053b Binary files /dev/null and b/tests/files/table_region.pdf differ diff --git a/tests/test_common.py b/tests/test_common.py index 83c436b..2335060 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -159,6 +159,14 @@ def test_lattice_two_tables(): assert df2.equals(tables[1].df) +def test_lattice_table_regions(): + df = pd.DataFrame(data_lattice_table_regions) + + filename = os.path.join(testdir, "table_region.pdf") + tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"]) + assert df.equals(tables[0].df) + + def test_lattice_table_areas(): df = pd.DataFrame(data_lattice_table_areas) @@ -179,7 +187,7 @@ def test_lattice_copy_text(): df = pd.DataFrame(data_lattice_copy_text) filename = os.path.join(testdir, "row_span_1.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") + tables = camelot.read_pdf(filename, line_scale=60, copy_text="v") assert df.equals(tables[0].df) @@ -189,13 +197,13 @@ def test_lattice_shift_text(): df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) filename = os.path.join(testdir, "column_span_2.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=40) + tables = 
camelot.read_pdf(filename, line_scale=40) assert df_lt.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) + tables = camelot.read_pdf(filename, line_scale=40, shift_text=['']) assert df_disable.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) + tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b']) assert df_rb.equals(tables[0].df) @@ -207,6 +215,32 @@ def test_repr(): assert repr(tables[0].cells[0][0]) == "" +def test_pages(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url) + assert repr(tables) == "" + assert repr(tables[0]) == "" + assert repr(tables[0].cells[0][0]) == "" + + tables = camelot.read_pdf(url, pages='1-end') + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + tables = camelot.read_pdf(url, pages='all') + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + +def test_url(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url) + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + def test_arabic(): df = pd.DataFrame(data_arabic) diff --git a/tests/test_errors.py b/tests/test_errors.py index 49cb4f3..c2e4158 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -41,6 +41,15 @@ def test_stream_equal_length(): table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) +def test_image_warning(): + filename = os.path.join(testdir, 'image.pdf') + with warnings.catch_warnings(): + warnings.simplefilter('error') + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename) + assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.' + + def test_no_tables_found(): filename = os.path.join(testdir, 'blank.pdf') with warnings.catch_warnings():