Merge branch 'master' of github.com:socialcopsdev/camelot into replace-gs-c-api

pull/2/head
Vinayak Mehta 2019-01-05 11:22:38 +05:30
commit ab5391c76f
21 changed files with 375 additions and 112 deletions

View File

@ -6,6 +6,18 @@ master
**Improvements** **Improvements**
* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
* Kwarg `line_size_scaling` is now called `line_scale`.
* [#212](https://github.com/socialcopsdev/camelot/issues/212) Add support to export as sqlite database. [#244](https://github.com/socialcopsdev/camelot/pull/244) by Vinayak Mehta.
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
0.6.0 (2018-12-24)
------------------
**Improvements**
* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta. * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
* `strip_text`: To define characters that should be stripped from each string. * `strip_text`: To define characters that should be stripped from each string.
* `edge_tol`: Tolerance parameter for extending textedges vertically. * `edge_tol`: Tolerance parameter for extending textedges vertically.

View File

@ -21,7 +21,7 @@
>>> tables = camelot.read_pdf('foo.pdf') >>> tables = camelot.read_pdf('foo.pdf')
>>> tables >>> tables
<TableList n=1> <TableList n=1>
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html, sqlite
>>> tables[0] >>> tables[0]
<Table shape=(7, 7)> <Table shape=(7, 7)>
>>> tables[0].parsing_report >>> tables[0].parsing_report
@ -31,7 +31,7 @@
'order': 1, 'order': 1,
'page': 1 'page': 1
} }
>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html >>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_sqlite
>>> tables[0].df # get a pandas DataFrame! >>> tables[0].df # get a pandas DataFrame!
</pre> </pre>
@ -53,7 +53,7 @@ There's a [command-line interface](https://camelot-py.readthedocs.io/en/master/u
- **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.) - **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.)
- *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table. - *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table.
- Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). - Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
- **Export** to multiple formats, including JSON, Excel and HTML. - **Export** to multiple formats, including JSON, Excel, HTML and Sqlite.
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
VERSION = (0, 5, 0) VERSION = (0, 6, 0)
PRERELEASE = None # alpha, beta or rc PRERELEASE = None # alpha, beta or rc
REVISION = None REVISION = None

View File

@ -32,11 +32,11 @@ pass_config = click.make_pass_decorator(Config)
@click.version_option(version=__version__) @click.version_option(version=__version__)
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.') @click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.' @click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' Example: 1,3,4 or 1,4-end.') ' Example: 1,3,4 or 1,4-end or all.')
@click.option('-pw', '--password', help='Password for decryption.') @click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.') @click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format', @click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html']), type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
help='Output file format.') help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.') @click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True, @click.option('-split', '--split_text', is_flag=True,
@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
@cli.command('lattice') @cli.command('lattice')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True, @click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2' help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True, @click.option('-back', '--process_background', is_flag=True,
help='Process background lines.') help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15, @click.option('-scale', '--line_scale', default=15,
help='Line size scaling factor. The larger the value,' help='Line size scaling factor. The larger the value,'
' the smaller the detected lines.') ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
filepath = kwargs.pop('filepath') filepath = kwargs.pop('filepath')
kwargs.update(conf) kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas']) table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas kwargs['table_areas'] = None if not table_areas else table_areas
copy_text = list(kwargs['copy_text']) copy_text = list(kwargs['copy_text'])
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
@cli.command('stream') @cli.command('stream')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True, @click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2' help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
filepath = kwargs.pop('filepath') filepath = kwargs.pop('filepath')
kwargs.update(conf) kwargs.update(conf)
table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas']) table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas kwargs['table_areas'] = None if not table_areas else table_areas
columns = list(kwargs['columns']) columns = list(kwargs['columns'])

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import sqlite3
import zipfile import zipfile
import tempfile import tempfile
from itertools import chain from itertools import chain
@ -592,6 +593,28 @@ class Table(object):
with open(path, 'w') as f: with open(path, 'w') as f:
f.write(html_string) f.write(html_string)
def to_sqlite(self, path, **kwargs):
"""Writes Table to sqlite database.
For kwargs, check :meth:`pandas.DataFrame.to_sql`.
Parameters
----------
path : str
Output filepath.
"""
kw = {
'if_exists': 'replace',
'index': False
}
kw.update(kwargs)
conn = sqlite3.connect(path)
table_name = 'page-{}-table-{}'.format(self.page, self.order)
self.df.to_sql(table_name, conn, **kw)
conn.commit()
conn.close()
class TableList(object): class TableList(object):
"""Defines a list of camelot.core.Table objects. Each table can """Defines a list of camelot.core.Table objects. Each table can
@ -656,7 +679,7 @@ class TableList(object):
path : str path : str
Output filepath. Output filepath.
f : str f : str
File format. Can be csv, json, excel and html. File format. Can be csv, json, excel, html and sqlite.
compress : bool compress : bool
Whether or not to add files to a ZIP archive. Whether or not to add files to a ZIP archive.
@ -689,3 +712,11 @@ class TableList(object):
zipname = os.path.join(os.path.dirname(path), root) + '.zip' zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z: with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath)) z.write(filepath, os.path.basename(filepath))
elif f == 'sqlite':
filepath = os.path.join(dirname, basename)
for table in self._tables:
table.to_sqlite(filepath)
if compress:
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
z.write(filepath, os.path.basename(filepath))

View File

@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList from .core import TableList
from .parsers import Stream, Lattice from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation) get_rotation, is_url, download_url)
class PDFHandler(object): class PDFHandler(object):
@ -18,20 +18,22 @@ class PDFHandler(object):
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'. Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None) password : str, optional (default: None)
Password for decryption. Password for decryption.
""" """
def __init__(self, filename, pages='1', password=None): def __init__(self, filepath, pages='1', password=None):
self.filename = filename if is_url(filepath):
if not filename.lower().endswith('.pdf'): filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages) self.pages = self._get_pages(self.filepath, pages)
if password is None: if password is None:
self.password = '' self.password = ''
else: else:
@ -39,16 +41,16 @@ class PDFHandler(object):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
self.password = self.password.encode('ascii') self.password = self.password.encode('ascii')
def _get_pages(self, filename, pages): def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints. """Converts pages string to list of ints.
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: 1,3,4 or 1,4-end. Example: '1,3,4' or '1,4-end' or 'all'.
Returns Returns
------- -------
@ -60,7 +62,7 @@ class PDFHandler(object):
if pages == '1': if pages == '1':
page_numbers.append({'start': 1, 'end': 1}) page_numbers.append({'start': 1, 'end': 1})
else: else:
infile = PdfFileReader(open(filename, 'rb'), strict=False) infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
if pages == 'all': if pages == 'all':
@ -79,20 +81,20 @@ class PDFHandler(object):
P.extend(range(p['start'], p['end'] + 1)) P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P)) return sorted(set(P))
def _save_page(self, filename, page, temp): def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory. """Saves specified page from PDF into a temporary directory.
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
page : int page : int
Page number. Page number.
temp : str temp : str
Tmp directory. Tmp directory.
""" """
with open(filename, 'rb') as fileobj: with open(filepath, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
@ -105,10 +107,10 @@ class PDFHandler(object):
outfile.write(f) outfile.write(f)
layout, dim = get_page_layout(fpath) layout, dim = get_page_layout(fpath)
# fix rotated PDF # fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh") chars = get_text_objects(layout, ltype="char")
lttextlv = get_text_objects(layout, ltype="lv") horizontal_text = get_text_objects(layout, ltype="horizontal_text")
ltchar = get_text_objects(layout, ltype="char") vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(lttextlh, lttextlv, ltchar) rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '': if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext]) fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new) os.rename(fpath, fpath_new)
@ -150,7 +152,7 @@ class PDFHandler(object):
tables = [] tables = []
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
for p in self.pages: for p in self.pages:
self._save_page(self.filename, p, tempdir) self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages] for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)

View File

@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
return img, threshold return img, threshold
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): def find_lines(threshold, regions=None, direction='horizontal',
line_scale=15, iterations=0):
"""Finds horizontal and vertical lines by applying morphological """Finds horizontal and vertical lines by applying morphological
transformations on an image. transformations on an image.
@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
---------- ----------
threshold : object threshold : object
numpy.ndarray representing the thresholded image. numpy.ndarray representing the thresholded image.
regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in image coordinate space.
direction : string, optional (default: 'horizontal') direction : string, optional (default: 'horizontal')
Specifies whether to find vertical or horizontal lines. Specifies whether to find vertical or horizontal lines.
line_size_scaling : int, optional (default: 15) line_scale : int, optional (default: 15)
Factor by which the page dimensions will be divided to get Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected. smallest length of lines that should be detected.
@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
lines = [] lines = []
if direction == 'vertical': if direction == 'vertical':
size = threshold.shape[0] // line_size_scaling size = threshold.shape[0] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
elif direction == 'horizontal': elif direction == 'horizontal':
size = threshold.shape[1] // line_size_scaling size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None: elif direction is None:
raise ValueError("Specify direction as either 'vertical' or" raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'") " 'horizontal'")
if regions is not None:
region_mask = np.zeros(threshold.shape)
for region in regions:
x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1
threshold = np.multiply(threshold, region_mask)
threshold = cv2.erode(threshold, el) threshold = cv2.erode(threshold, el)
threshold = cv2.dilate(threshold, el) threshold = cv2.dilate(threshold, el)
dmask = cv2.dilate(threshold, el, iterations=iterations) dmask = cv2.dilate(threshold, el, iterations=iterations)
try: try:
_, contours, _ = cv2.findContours( _, contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, _ = cv2.findContours( contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for c in contours: for c in contours:
x, y, w, h = cv2.boundingRect(c) x, y, w, h = cv2.boundingRect(c)
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
return dmask, lines return dmask, lines
def find_table_contours(vertical, horizontal): def find_contours(vertical, horizontal):
"""Finds table boundaries using OpenCV's findContours. """Finds table boundaries using OpenCV's findContours.
Parameters Parameters
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
try: try:
__, contours, __ = cv2.findContours( __, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
contours, __ = cv2.findContours( contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# sort in reverse based on contour area and use first 10 contours
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
cont = [] cont = []
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
return cont return cont
def find_table_joints(contours, vertical, horizontal): def find_joints(contours, vertical, horizontal):
"""Finds joints/intersections present inside each table boundary. """Finds joints/intersections present inside each table boundary.
Parameters Parameters
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
and (x2, y2) -> rt in image coordinate space. and (x2, y2) -> rt in image coordinate space.
""" """
joints = np.bitwise_and(vertical, horizontal) joints = np.multiply(vertical, horizontal)
tables = {} tables = {}
for c in contours: for c in contours:
x, y, w, h = c x, y, w, h = c
roi = joints[y : y + h, x : x + w] roi = joints[y : y + h, x : x + w]
try: try:
__, jc, __ = cv2.findContours( __, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError: except ValueError:
# for opencv backward compatibility # for opencv backward compatibility
jc, __ = cv2.findContours( jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with less than 4 joints if len(jc) <= 4: # remove contours with less than 4 joints
continue continue
joint_coords = [] joint_coords = []

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import warnings import warnings
from .handlers import PDFHandler from .handlers import PDFHandler
@ -15,10 +16,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Parameters Parameters
---------- ----------
filepath : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'. Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None) password : str, optional (default: None)
Password for decryption. Password for decryption.
flavor : str (default: 'lattice') flavor : str (default: 'lattice')
@ -51,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
to generate columns. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
Process background lines. Process background lines.
line_size_scaling* : int, optional (default: 15) line_scale* : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text the detected lines. Making it very large will lead to text
being detected as lines. being detected as lines.

View File

@ -13,7 +13,8 @@ class BaseParser(object):
self.layout_kwargs = layout_kwargs self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout( self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs) filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.images = get_text_objects(self.layout, ltype='image')
self.vertical_text = get_text_objects(self.layout, ltype="lv") self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)

View File

@ -19,7 +19,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
merge_close_lines, get_table_index, compute_accuracy, merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace) compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines, from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints) find_contours, find_joints)
logger = logging.getLogger('camelot') logger = logging.getLogger('camelot')
@ -31,13 +31,17 @@ class Lattice(BaseParser):
Parameters Parameters
---------- ----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None) table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2 List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space. in PDF coordinate space.
process_background : bool, optional (default: False) process_background : bool, optional (default: False)
Process background lines. Process background lines.
line_size_scaling : int, optional (default: 15) line_scale : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text the detected lines. Making it very large will lead to text
being detected as lines. being detected as lines.
@ -80,14 +84,15 @@ class Lattice(BaseParser):
Resolution used for PDF to PNG conversion. Resolution used for PDF to PNG conversion.
""" """
def __init__(self, table_areas=None, process_background=False, def __init__(self, table_regions=None, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_scale=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, strip_text='', line_tol=2, split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs): iterations=0, resolution=300, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
self.line_size_scaling = line_size_scaling self.line_scale = line_scale
self.copy_text = copy_text self.copy_text = copy_text
self.shift_text = shift_text self.shift_text = shift_text
self.split_text = split_text self.split_text = split_text
@ -189,9 +194,22 @@ class Lattice(BaseParser):
null.close() null.close()
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas):
scaled_areas = []
for area in areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas
self.image, self.threshold = adaptive_threshold( self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background, self.imagename, process_background=self.process_background,
blocksize=self.threshold_blocksize, c=self.threshold_constant) blocksize=self.threshold_blocksize, c=self.threshold_constant)
image_width = self.image.shape[1] image_width = self.image.shape[1]
image_height = self.image.shape[0] image_height = self.image.shape[0]
image_width_scaler = image_width / float(self.pdf_width) image_width_scaler = image_width / float(self.pdf_width)
@ -201,27 +219,30 @@ class Lattice(BaseParser):
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
vertical_mask, vertical_segments = find_lines( if self.table_areas is None:
self.threshold, direction='vertical', regions = None
line_size_scaling=self.line_size_scaling, iterations=self.iterations) if self.table_regions is not None:
horizontal_mask, horizontal_segments = find_lines( regions = scale_areas(self.table_regions)
self.threshold, direction='horizontal',
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
if self.table_areas is not None: vertical_mask, vertical_segments = find_lines(
areas = [] self.threshold, regions=regions, direction='vertical',
for area in self.table_areas: line_scale=self.line_scale, iterations=self.iterations)
x1, y1, x2, y2 = area.split(",") horizontal_mask, horizontal_segments = find_lines(
x1 = float(x1) self.threshold, regions=regions, direction='horizontal',
y1 = float(y1) line_scale=self.line_scale, iterations=self.iterations)
x2 = float(x2)
y2 = float(y2) contours = find_contours(vertical_mask, horizontal_mask)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
else: else:
contours = find_table_contours(vertical_mask, horizontal_mask) vertical_mask, vertical_segments = find_lines(
table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) self.threshold, direction='vertical', line_scale=self.line_scale,
iterations=self.iterations)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal', line_scale=self.line_scale,
iterations=self.iterations)
areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
self.table_bbox_unscaled = copy.deepcopy(table_bbox) self.table_bbox_unscaled = copy.deepcopy(table_bbox)
@ -318,7 +339,11 @@ class Lattice(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -26,6 +26,10 @@ class Stream(BaseParser):
Parameters Parameters
---------- ----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None) table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2 List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -51,9 +55,10 @@ class Stream(BaseParser):
to generate columns. to generate columns.
""" """
def __init__(self, table_areas=None, columns=None, split_text=False, def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_tol=50, row_tol=2, flag_size=False, strip_text='', edge_tol=50, row_tol=2,
column_tol=0, **kwargs): column_tol=0, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
@ -275,7 +280,18 @@ class Stream(BaseParser):
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.textedges = [] self.textedges = []
if self.table_areas is not None: if self.table_areas is None:
hor_text = self.horizontal_text
if self.table_regions is not None:
# filter horizontal text
hor_text = []
for region in self.table_regions:
x1, y1, x2, y2 = region
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
else:
table_bbox = {} table_bbox = {}
for area in self.table_areas: for area in self.table_areas:
x1, y1, x2, y2 = area.split(",") x1, y1, x2, y2 = area.split(",")
@ -284,9 +300,6 @@ class Stream(BaseParser):
x2 = float(x2) x2 = float(x2)
y2 = float(y2) y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None table_bbox[(x1, y2, x2, y1)] = None
else:
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):
@ -395,7 +408,11 @@ class Stream(BaseParser):
logger.info('Processing {}'.format(os.path.basename(self.rootname))) logger.info('Processing {}'.format(os.path.basename(self.rootname)))
if not self.horizontal_text: if not self.horizontal_text:
warnings.warn("No tables found on {}".format( if self.images:
warnings.warn('{} is image-based, camelot only works on'
' text-based pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname))) os.path.basename(self.rootname)))
return [] return []

View File

@ -1,12 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import os
import sys
import random
import shutil import shutil
import string
import tempfile import tempfile
import warnings import warnings
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter
import numpy as np import numpy as np
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -15,7 +20,78 @@ from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical, LTImage)
PY3 = sys.version_info[0] >= 3
if PY3:
from urllib.request import urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
else:
from urllib2 import urlopen
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If url has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def random_string(length):
ret = ''
while length:
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
length -= 1
return ret
def download_url(url):
"""Download file from specified URL.
Parameters
----------
url : str or unicode
Returns
-------
filepath : str or unicode
Temporary filepath.
"""
filename = '{}.pdf'.format(random_string(6))
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
obj = urlopen(url)
if PY3:
content_type = obj.info().get_content_type()
else:
content_type = obj.info().getheader('Content-Type')
if content_type != 'application/pdf':
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
shutil.move(f.name, filepath)
return filepath
stream_kwargs = [ stream_kwargs = [
@ -25,7 +101,7 @@ stream_kwargs = [
] ]
lattice_kwargs = [ lattice_kwargs = [
'process_background', 'process_background',
'line_size_scaling', 'line_scale',
'copy_text', 'copy_text',
'shift_text', 'shift_text',
'line_tol', 'line_tol',
@ -194,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new return tables_new, v_segments_new, h_segments_new
def get_rotation(lttextlh, lttextlv, ltchar): def get_rotation(chars, horizontal_text, vertical_text):
"""Detects if text in table is rotated or not using the current """Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation. transformation matrix (CTM) and returns its orientation.
Parameters Parameters
---------- ----------
lttextlh : list horizontal_text : list
List of PDFMiner LTTextLineHorizontal objects. List of PDFMiner LTTextLineHorizontal objects.
lttextlv : list vertical_text : list
List of PDFMiner LTTextLineVertical objects. List of PDFMiner LTTextLineVertical objects.
ltchar : list ltchar : list
List of PDFMiner LTChar objects. List of PDFMiner LTChar objects.
@ -216,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
""" """
rotation = '' rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()]) hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()]) vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen: if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar) clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise' rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
return rotation return rotation
@ -263,7 +339,7 @@ def text_in_bbox(bbox, text):
---------- ----------
bbox : tuple bbox : tuple
Tuple (x1, y1, x2, y2) representing a bounding box where Tuple (x1, y1, x2, y2) representing a bounding box where
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
space. space.
text : List of PDFMiner text objects. text : List of PDFMiner text objects.
@ -637,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects. List of PDFMiner text objects.
""" """
if ltype == "char": if ltype == 'char':
LTObject = LTChar LTObject = LTChar
elif ltype == "lh": elif ltype == 'image':
LTObject = LTImage
elif ltype == 'horizontal_text':
LTObject = LTTextLineHorizontal LTObject = LTTextLineHorizontal
elif ltype == "lv": elif ltype == 'vertical_text':
LTObject = LTTextLineVertical LTObject = LTTextLineVertical
if t is None: if t is None:
t = [] t = []

View File

@ -0,0 +1,4 @@
"Età dellAssicuratoallepoca del decesso","Misura % dimaggiorazione"
"18-75","1,00%"
"76-80","0,50%"
"81 in poi","0,10%"
1 Età dell’Assicuratoall’epoca del decesso Misura % dimaggiorazione
2 18-75 1,00%
3 76-80 0,50%
4 81 in poi 0,10%

Binary file not shown.

View File

@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
Specify table areas Specify table areas
------------------- -------------------
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument. Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
:: ::
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
.. csv-table:: .. csv-table::
:file: ../_static/csv/table_areas.csv :file: ../_static/csv/table_areas.csv
Specify table regions
---------------------
However there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table might not lie at the exact coordinates every time but in an approximate region.
You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
::
>>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
>>> tables[0].df
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot lattice -R 170,370,560,270 table_regions.pdf
.. csv-table::
:file: ../_static/csv/table_regions.csv
Specify column separators Specify column separators
------------------------- -------------------------
@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
Detect short lines Detect short lines
------------------ ------------------
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15.
As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. .. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15. Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15.
@ -458,11 +477,11 @@ Let's plot the table for this PDF.
:alt: A plot of the PDF table with short lines :alt: A plot of the PDF table with short lines
:align: left :align: left
Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again. Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
>>> camelot.plot(tables[0], kind='grid') >>> camelot.plot(tables[0], kind='grid')
>>> plt.show() >>> plt.show()
@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['']) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
>>> tables[0].df >>> tables[0].df
.. tip:: .. tip::

View File

@ -14,7 +14,7 @@ Begin by importing the Camelot module::
>>> import camelot >>> import camelot
Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that, we will set the ``mesh`` keyword argument to ``True``. Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here.
.. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``. .. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
@ -56,7 +56,7 @@ Woah! The accuracy is top-notch and there is less whitespace, which means the ta
.. csv-table:: .. csv-table::
:file: ../_static/csv/foo.csv :file: ../_static/csv/foo.csv
Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>` or :meth:`to_html() <camelot.core.Table.to_html>` methods to export the table as JSON, Excel and HTML files respectively. Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>` :meth:`to_html() <camelot.core.Table.to_html>` or :meth:`to_sqlite() <camelot.core.Table.to_sqlite>` methods to export the table as JSON, Excel, HTML files or a sqlite database respectively.
:: ::
@ -76,7 +76,7 @@ You can also export all tables at once, using the :class:`tables <camelot.core.T
$ camelot --format csv --output foo.csv lattice foo.pdf $ camelot --format csv --output foo.csv lattice foo.pdf
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``. This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'``, ``f='html'`` or ``f='sqlite'``.
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files. .. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.

View File

@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"] ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
] ]
data_lattice_table_regions = [
['Età dellAssicurato \nallepoca del decesso', 'Misura % di \nmaggiorazione'],
['18-75', '1,00%'],
['76-80', '0,50%'],
['81 in poi', '0,10%']
]
data_lattice_table_areas = [ data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""], ["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""],

Binary file not shown.

Binary file not shown.

View File

@ -159,6 +159,14 @@ def test_lattice_two_tables():
assert df2.equals(tables[1].df) assert df2.equals(tables[1].df)
def test_lattice_table_regions():
df = pd.DataFrame(data_lattice_table_regions)
filename = os.path.join(testdir, "table_region.pdf")
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
assert df.equals(tables[0].df)
def test_lattice_table_areas(): def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas) df = pd.DataFrame(data_lattice_table_areas)
@ -179,7 +187,7 @@ def test_lattice_copy_text():
df = pd.DataFrame(data_lattice_copy_text) df = pd.DataFrame(data_lattice_copy_text)
filename = os.path.join(testdir, "row_span_1.pdf") filename = os.path.join(testdir, "row_span_1.pdf")
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -189,13 +197,13 @@ def test_lattice_shift_text():
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
filename = os.path.join(testdir, "column_span_2.pdf") filename = os.path.join(testdir, "column_span_2.pdf")
tables = camelot.read_pdf(filename, line_size_scaling=40) tables = camelot.read_pdf(filename, line_scale=40)
assert df_lt.equals(tables[0].df) assert df_lt.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
assert df_disable.equals(tables[0].df) assert df_disable.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
assert df_rb.equals(tables[0].df) assert df_rb.equals(tables[0].df)
@ -207,6 +215,32 @@ def test_repr():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_pages():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages='1-end')
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages='all')
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_url():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_arabic(): def test_arabic():
df = pd.DataFrame(data_arabic) df = pd.DataFrame(data_arabic)

View File

@ -41,6 +41,15 @@ def test_stream_equal_length():
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
def test_image_warning():
filename = os.path.join(testdir, 'image.pdf')
with warnings.catch_warnings():
warnings.simplefilter('error')
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename)
assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
def test_no_tables_found(): def test_no_tables_found():
filename = os.path.join(testdir, 'blank.pdf') filename = os.path.join(testdir, 'blank.pdf')
with warnings.catch_warnings(): with warnings.catch_warnings():