Merge branch 'master' of github.com:socialcopsdev/camelot into replace-gs-c-api
commit
ab5391c76f
12
HISTORY.md
12
HISTORY.md
|
|
@ -6,6 +6,18 @@ master
|
||||||
|
|
||||||
**Improvements**
|
**Improvements**
|
||||||
|
|
||||||
|
* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
|
||||||
|
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
|
||||||
|
* Kwarg `line_size_scaling` is now called `line_scale`.
|
||||||
|
* [#212](https://github.com/socialcopsdev/camelot/issues/212) Add support to export as sqlite database. [#244](https://github.com/socialcopsdev/camelot/pull/244) by Vinayak Mehta.
|
||||||
|
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
|
||||||
|
|
||||||
|
0.6.0 (2018-12-24)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
**Improvements**
|
||||||
|
|
||||||
|
* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
|
||||||
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
|
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
|
||||||
* `strip_text`: To define characters that should be stripped from each string.
|
* `strip_text`: To define characters that should be stripped from each string.
|
||||||
* `edge_tol`: Tolerance parameter for extending textedges vertically.
|
* `edge_tol`: Tolerance parameter for extending textedges vertically.
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@
|
||||||
>>> tables = camelot.read_pdf('foo.pdf')
|
>>> tables = camelot.read_pdf('foo.pdf')
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList n=1>
|
<TableList n=1>
|
||||||
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
|
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html, sqlite
|
||||||
>>> tables[0]
|
>>> tables[0]
|
||||||
<Table shape=(7, 7)>
|
<Table shape=(7, 7)>
|
||||||
>>> tables[0].parsing_report
|
>>> tables[0].parsing_report
|
||||||
|
|
@ -31,7 +31,7 @@
|
||||||
'order': 1,
|
'order': 1,
|
||||||
'page': 1
|
'page': 1
|
||||||
}
|
}
|
||||||
>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
|
>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_sqlite
|
||||||
>>> tables[0].df # get a pandas DataFrame!
|
>>> tables[0].df # get a pandas DataFrame!
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
|
|
@ -53,7 +53,7 @@ There's a [command-line interface](https://camelot-py.readthedocs.io/en/master/u
|
||||||
- **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.)
|
- **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.)
|
||||||
- *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table.
|
- *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table.
|
||||||
- Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
|
- Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
|
||||||
- **Export** to multiple formats, including JSON, Excel and HTML.
|
- **Export** to multiple formats, including JSON, Excel, HTML and SQLite.
|
||||||
|
|
||||||
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
VERSION = (0, 5, 0)
|
VERSION = (0, 6, 0)
|
||||||
PRERELEASE = None # alpha, beta or rc
|
PRERELEASE = None # alpha, beta or rc
|
||||||
REVISION = None
|
REVISION = None
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -32,11 +32,11 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
|
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
|
||||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||||
' Example: 1,3,4 or 1,4-end.')
|
' Example: 1,3,4 or 1,4-end or all.')
|
||||||
@click.option('-pw', '--password', help='Password for decryption.')
|
@click.option('-pw', '--password', help='Password for decryption.')
|
||||||
@click.option('-o', '--output', help='Output file path.')
|
@click.option('-o', '--output', help='Output file path.')
|
||||||
@click.option('-f', '--format',
|
@click.option('-f', '--format',
|
||||||
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
|
||||||
help='Output file format.')
|
help='Output file format.')
|
||||||
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
|
||||||
@click.option('-split', '--split_text', is_flag=True,
|
@click.option('-split', '--split_text', is_flag=True,
|
||||||
|
|
@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
@cli.command('lattice')
|
@cli.command('lattice')
|
||||||
|
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||||
|
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||||
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-back', '--process_background', is_flag=True,
|
@click.option('-back', '--process_background', is_flag=True,
|
||||||
help='Process background lines.')
|
help='Process background lines.')
|
||||||
@click.option('-scale', '--line_size_scaling', default=15,
|
@click.option('-scale', '--line_scale', default=15,
|
||||||
help='Line size scaling factor. The larger the value,'
|
help='Line size scaling factor. The larger the value,'
|
||||||
' the smaller the detected lines.')
|
' the smaller the detected lines.')
|
||||||
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
|
||||||
|
|
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop('filepath')
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
|
table_regions = list(kwargs['table_regions'])
|
||||||
|
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||||
table_areas = list(kwargs['table_areas'])
|
table_areas = list(kwargs['table_areas'])
|
||||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||||
copy_text = list(kwargs['copy_text'])
|
copy_text = list(kwargs['copy_text'])
|
||||||
|
|
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
@cli.command('stream')
|
@cli.command('stream')
|
||||||
|
@click.option('-R', '--table_regions', default=[], multiple=True,
|
||||||
|
help='Page regions to analyze. Example: x1,y1,x2,y2'
|
||||||
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-T', '--table_areas', default=[], multiple=True,
|
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
|
|
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop('filepath')
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
|
table_regions = list(kwargs['table_regions'])
|
||||||
|
kwargs['table_regions'] = None if not table_regions else table_regions
|
||||||
table_areas = list(kwargs['table_areas'])
|
table_areas = list(kwargs['table_areas'])
|
||||||
kwargs['table_areas'] = None if not table_areas else table_areas
|
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sqlite3
|
||||||
import zipfile
|
import zipfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
|
@ -592,6 +593,28 @@ class Table(object):
|
||||||
with open(path, 'w') as f:
|
with open(path, 'w') as f:
|
||||||
f.write(html_string)
|
f.write(html_string)
|
||||||
|
|
||||||
|
def to_sqlite(self, path, **kwargs):
    """Writes Table to sqlite database.

    For kwargs, check :meth:`pandas.DataFrame.to_sql`.

    The table is stored under the name ``page-<page>-table-<order>``.
    By default an existing table with that name is replaced and the
    DataFrame index is not written; both defaults can be overridden
    through kwargs.

    Parameters
    ----------
    path : str
        Output filepath.

    """
    kw = {
        'if_exists': 'replace',
        'index': False
    }
    kw.update(kwargs)
    table_name = 'page-{}-table-{}'.format(self.page, self.order)
    conn = sqlite3.connect(path)
    try:
        self.df.to_sql(table_name, conn, **kw)
        conn.commit()
    finally:
        # Always release the database handle, even when to_sql() fails,
        # so the output file is not left locked by a leaked connection.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
class TableList(object):
|
class TableList(object):
|
||||||
"""Defines a list of camelot.core.Table objects. Each table can
|
"""Defines a list of camelot.core.Table objects. Each table can
|
||||||
|
|
@ -656,7 +679,7 @@ class TableList(object):
|
||||||
path : str
|
path : str
|
||||||
Output filepath.
|
Output filepath.
|
||||||
f : str
|
f : str
|
||||||
File format. Can be csv, json, excel and html.
|
File format. Can be csv, json, excel, html and sqlite.
|
||||||
compress : bool
|
compress : bool
|
||||||
Whether or not to add files to a ZIP archive.
|
Whether or not to add files to a ZIP archive.
|
||||||
|
|
||||||
|
|
@ -689,3 +712,11 @@ class TableList(object):
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
elif f == 'sqlite':
|
||||||
|
filepath = os.path.join(dirname, basename)
|
||||||
|
for table in self._tables:
|
||||||
|
table.to_sqlite(filepath)
|
||||||
|
if compress:
|
||||||
|
zipname = os.path.join(os.path.dirname(path), root) + '.zip'
|
||||||
|
with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
|
||||||
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
from .core import TableList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
get_rotation)
|
get_rotation, is_url, download_url)
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
|
|
@ -18,20 +18,22 @@ class PDFHandler(object):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: '1,3,4' or '1,4-end'.
|
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||||
password : str, optional (default: None)
|
password : str, optional (default: None)
|
||||||
Password for decryption.
|
Password for decryption.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, pages='1', password=None):
|
def __init__(self, filepath, pages='1', password=None):
|
||||||
self.filename = filename
|
if is_url(filepath):
|
||||||
if not filename.lower().endswith('.pdf'):
|
filepath = download_url(filepath)
|
||||||
|
self.filepath = filepath
|
||||||
|
if not filepath.lower().endswith('.pdf'):
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
self.pages = self._get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filepath, pages)
|
||||||
if password is None:
|
if password is None:
|
||||||
self.password = ''
|
self.password = ''
|
||||||
else:
|
else:
|
||||||
|
|
@ -39,16 +41,16 @@ class PDFHandler(object):
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
self.password = self.password.encode('ascii')
|
self.password = self.password.encode('ascii')
|
||||||
|
|
||||||
def _get_pages(self, filename, pages):
|
def _get_pages(self, filepath, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end.
|
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -60,7 +62,7 @@ class PDFHandler(object):
|
||||||
if pages == '1':
|
if pages == '1':
|
||||||
page_numbers.append({'start': 1, 'end': 1})
|
page_numbers.append({'start': 1, 'end': 1})
|
||||||
else:
|
else:
|
||||||
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
if pages == 'all':
|
if pages == 'all':
|
||||||
|
|
@ -79,20 +81,20 @@ class PDFHandler(object):
|
||||||
P.extend(range(p['start'], p['end'] + 1))
|
P.extend(range(p['start'], p['end'] + 1))
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def _save_page(self, filename, page, temp):
|
def _save_page(self, filepath, page, temp):
|
||||||
"""Saves specified page from PDF into a temporary directory.
|
"""Saves specified page from PDF into a temporary directory.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
page : int
|
page : int
|
||||||
Page number.
|
Page number.
|
||||||
temp : str
|
temp : str
|
||||||
Tmp directory.
|
Tmp directory.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filepath, 'rb') as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
|
|
@ -105,10 +107,10 @@ class PDFHandler(object):
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
layout, dim = get_page_layout(fpath)
|
layout, dim = get_page_layout(fpath)
|
||||||
# fix rotated PDF
|
# fix rotated PDF
|
||||||
lttextlh = get_text_objects(layout, ltype="lh")
|
chars = get_text_objects(layout, ltype="char")
|
||||||
lttextlv = get_text_objects(layout, ltype="lv")
|
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
|
||||||
ltchar = get_text_objects(layout, ltype="char")
|
vertical_text = get_text_objects(layout, ltype="vertical_text")
|
||||||
rotation = get_rotation(lttextlh, lttextlv, ltchar)
|
rotation = get_rotation(chars, horizontal_text, vertical_text)
|
||||||
if rotation != '':
|
if rotation != '':
|
||||||
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
|
|
@ -150,7 +152,7 @@ class PDFHandler(object):
|
||||||
tables = []
|
tables = []
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filename, p, tempdir)
|
self._save_page(self.filepath, p, tempdir)
|
||||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
return img, threshold
|
return img, threshold
|
||||||
|
|
||||||
|
|
||||||
def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
|
def find_lines(threshold, regions=None, direction='horizontal',
|
||||||
|
line_scale=15, iterations=0):
|
||||||
"""Finds horizontal and vertical lines by applying morphological
|
"""Finds horizontal and vertical lines by applying morphological
|
||||||
transformations on an image.
|
transformations on an image.
|
||||||
|
|
||||||
|
|
@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
||||||
----------
|
----------
|
||||||
threshold : object
|
threshold : object
|
||||||
numpy.ndarray representing the thresholded image.
|
numpy.ndarray representing the thresholded image.
|
||||||
|
regions : list, optional (default: None)
|
||||||
|
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||||
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
in image coordinate space.
|
||||||
direction : string, optional (default: 'horizontal')
|
direction : string, optional (default: 'horizontal')
|
||||||
Specifies whether to find vertical or horizontal lines.
|
Specifies whether to find vertical or horizontal lines.
|
||||||
line_size_scaling : int, optional (default: 15)
|
line_scale : int, optional (default: 15)
|
||||||
Factor by which the page dimensions will be divided to get
|
Factor by which the page dimensions will be divided to get
|
||||||
smallest length of lines that should be detected.
|
smallest length of lines that should be detected.
|
||||||
|
|
||||||
|
|
@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
if direction == 'vertical':
|
if direction == 'vertical':
|
||||||
size = threshold.shape[0] // line_size_scaling
|
size = threshold.shape[0] // line_scale
|
||||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
|
||||||
elif direction == 'horizontal':
|
elif direction == 'horizontal':
|
||||||
size = threshold.shape[1] // line_size_scaling
|
size = threshold.shape[1] // line_scale
|
||||||
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
|
||||||
elif direction is None:
|
elif direction is None:
|
||||||
raise ValueError("Specify direction as either 'vertical' or"
|
raise ValueError("Specify direction as either 'vertical' or"
|
||||||
" 'horizontal'")
|
" 'horizontal'")
|
||||||
|
|
||||||
|
if regions is not None:
|
||||||
|
region_mask = np.zeros(threshold.shape)
|
||||||
|
for region in regions:
|
||||||
|
x, y, w, h = region
|
||||||
|
region_mask[y : y + h, x : x + w] = 1
|
||||||
|
threshold = np.multiply(threshold, region_mask)
|
||||||
|
|
||||||
threshold = cv2.erode(threshold, el)
|
threshold = cv2.erode(threshold, el)
|
||||||
threshold = cv2.dilate(threshold, el)
|
threshold = cv2.dilate(threshold, el)
|
||||||
dmask = cv2.dilate(threshold, el, iterations=iterations)
|
dmask = cv2.dilate(threshold, el, iterations=iterations)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_, contours, _ = cv2.findContours(
|
_, contours, _ = cv2.findContours(
|
||||||
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
contours, _ = cv2.findContours(
|
contours, _ = cv2.findContours(
|
||||||
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
|
||||||
for c in contours:
|
for c in contours:
|
||||||
x, y, w, h = cv2.boundingRect(c)
|
x, y, w, h = cv2.boundingRect(c)
|
||||||
|
|
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
|
||||||
return dmask, lines
|
return dmask, lines
|
||||||
|
|
||||||
|
|
||||||
def find_table_contours(vertical, horizontal):
|
def find_contours(vertical, horizontal):
|
||||||
"""Finds table boundaries using OpenCV's findContours.
|
"""Finds table boundaries using OpenCV's findContours.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
__, contours, __ = cv2.findContours(
|
__, contours, __ = cv2.findContours(
|
||||||
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
contours, __ = cv2.findContours(
|
contours, __ = cv2.findContours(
|
||||||
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
# sort in reverse based on contour area and use first 10 contours
|
||||||
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
||||||
|
|
||||||
cont = []
|
cont = []
|
||||||
|
|
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
|
||||||
return cont
|
return cont
|
||||||
|
|
||||||
|
|
||||||
def find_table_joints(contours, vertical, horizontal):
|
def find_joints(contours, vertical, horizontal):
|
||||||
"""Finds joints/intersections present inside each table boundary.
|
"""Finds joints/intersections present inside each table boundary.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
|
||||||
and (x2, y2) -> rt in image coordinate space.
|
and (x2, y2) -> rt in image coordinate space.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
joints = np.bitwise_and(vertical, horizontal)
|
joints = np.multiply(vertical, horizontal)
|
||||||
tables = {}
|
tables = {}
|
||||||
for c in contours:
|
for c in contours:
|
||||||
x, y, w, h = c
|
x, y, w, h = c
|
||||||
roi = joints[y : y + h, x : x + w]
|
roi = joints[y : y + h, x : x + w]
|
||||||
try:
|
try:
|
||||||
__, jc, __ = cv2.findContours(
|
__, jc, __ = cv2.findContours(
|
||||||
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# for opencv backward compatibility
|
# for opencv backward compatibility
|
||||||
jc, __ = cv2.findContours(
|
jc, __ = cv2.findContours(
|
||||||
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
if len(jc) <= 4: # remove contours with less than 4 joints
|
if len(jc) <= 4: # remove contours with less than 4 joints
|
||||||
continue
|
continue
|
||||||
joint_coords = []
|
joint_coords = []
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .handlers import PDFHandler
|
from .handlers import PDFHandler
|
||||||
|
|
@ -15,10 +16,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: '1,3,4' or '1,4-end'.
|
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||||
password : str, optional (default: None)
|
password : str, optional (default: None)
|
||||||
Password for decryption.
|
Password for decryption.
|
||||||
flavor : str (default: 'lattice')
|
flavor : str (default: 'lattice')
|
||||||
|
|
@ -51,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
to generate columns.
|
to generate columns.
|
||||||
process_background* : bool, optional (default: False)
|
process_background* : bool, optional (default: False)
|
||||||
Process background lines.
|
Process background lines.
|
||||||
line_size_scaling* : int, optional (default: 15)
|
line_scale* : int, optional (default: 15)
|
||||||
Line size scaling factor. The larger the value the smaller
|
Line size scaling factor. The larger the value the smaller
|
||||||
the detected lines. Making it very large will lead to text
|
the detected lines. Making it very large will lead to text
|
||||||
being detected as lines.
|
being detected as lines.
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,8 @@ class BaseParser(object):
|
||||||
self.layout_kwargs = layout_kwargs
|
self.layout_kwargs = layout_kwargs
|
||||||
self.layout, self.dimensions = get_page_layout(
|
self.layout, self.dimensions = get_page_layout(
|
||||||
filename, **layout_kwargs)
|
filename, **layout_kwargs)
|
||||||
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
|
self.images = get_text_objects(self.layout, ltype='image')
|
||||||
self.vertical_text = get_text_objects(self.layout, ltype="lv")
|
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
|
||||||
|
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
|
||||||
merge_close_lines, get_table_index, compute_accuracy,
|
merge_close_lines, get_table_index, compute_accuracy,
|
||||||
compute_whitespace)
|
compute_whitespace)
|
||||||
from ..image_processing import (adaptive_threshold, find_lines,
|
from ..image_processing import (adaptive_threshold, find_lines,
|
||||||
find_table_contours, find_table_joints)
|
find_contours, find_joints)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger('camelot')
|
||||||
|
|
@ -31,13 +31,17 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
table_regions : list, optional (default: None)
|
||||||
|
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||||
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
in PDF coordinate space.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
in PDF coordinate space.
|
in PDF coordinate space.
|
||||||
process_background : bool, optional (default: False)
|
process_background : bool, optional (default: False)
|
||||||
Process background lines.
|
Process background lines.
|
||||||
line_size_scaling : int, optional (default: 15)
|
line_scale : int, optional (default: 15)
|
||||||
Line size scaling factor. The larger the value the smaller
|
Line size scaling factor. The larger the value the smaller
|
||||||
the detected lines. Making it very large will lead to text
|
the detected lines. Making it very large will lead to text
|
||||||
being detected as lines.
|
being detected as lines.
|
||||||
|
|
@ -80,14 +84,15 @@ class Lattice(BaseParser):
|
||||||
Resolution used for PDF to PNG conversion.
|
Resolution used for PDF to PNG conversion.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, process_background=False,
|
def __init__(self, table_regions=None, table_areas=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_scale=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, strip_text='', line_tol=2,
|
split_text=False, flag_size=False, strip_text='', line_tol=2,
|
||||||
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, resolution=300, **kwargs):
|
iterations=0, resolution=300, **kwargs):
|
||||||
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_scale = line_scale
|
||||||
self.copy_text = copy_text
|
self.copy_text = copy_text
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
|
|
@ -189,9 +194,22 @@ class Lattice(BaseParser):
|
||||||
null.close()
|
null.close()
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
|
def scale_areas(areas):
|
||||||
|
scaled_areas = []
|
||||||
|
for area in areas:
|
||||||
|
x1, y1, x2, y2 = area.split(",")
|
||||||
|
x1 = float(x1)
|
||||||
|
y1 = float(y1)
|
||||||
|
x2 = float(x2)
|
||||||
|
y2 = float(y2)
|
||||||
|
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
|
||||||
|
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||||
|
return scaled_areas
|
||||||
|
|
||||||
self.image, self.threshold = adaptive_threshold(
|
self.image, self.threshold = adaptive_threshold(
|
||||||
self.imagename, process_background=self.process_background,
|
self.imagename, process_background=self.process_background,
|
||||||
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
blocksize=self.threshold_blocksize, c=self.threshold_constant)
|
||||||
|
|
||||||
image_width = self.image.shape[1]
|
image_width = self.image.shape[1]
|
||||||
image_height = self.image.shape[0]
|
image_height = self.image.shape[0]
|
||||||
image_width_scaler = image_width / float(self.pdf_width)
|
image_width_scaler = image_width / float(self.pdf_width)
|
||||||
|
|
@ -201,27 +219,30 @@ class Lattice(BaseParser):
|
||||||
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
|
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
|
||||||
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
|
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
|
||||||
|
|
||||||
vertical_mask, vertical_segments = find_lines(
|
if self.table_areas is None:
|
||||||
self.threshold, direction='vertical',
|
regions = None
|
||||||
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
if self.table_regions is not None:
|
||||||
horizontal_mask, horizontal_segments = find_lines(
|
regions = scale_areas(self.table_regions)
|
||||||
self.threshold, direction='horizontal',
|
|
||||||
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
|
||||||
|
|
||||||
if self.table_areas is not None:
|
vertical_mask, vertical_segments = find_lines(
|
||||||
areas = []
|
self.threshold, regions=regions, direction='vertical',
|
||||||
for area in self.table_areas:
|
line_scale=self.line_scale, iterations=self.iterations)
|
||||||
x1, y1, x2, y2 = area.split(",")
|
horizontal_mask, horizontal_segments = find_lines(
|
||||||
x1 = float(x1)
|
self.threshold, regions=regions, direction='horizontal',
|
||||||
y1 = float(y1)
|
line_scale=self.line_scale, iterations=self.iterations)
|
||||||
x2 = float(x2)
|
|
||||||
y2 = float(y2)
|
contours = find_contours(vertical_mask, horizontal_mask)
|
||||||
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
|
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
|
||||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
|
||||||
table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
|
|
||||||
else:
|
else:
|
||||||
contours = find_table_contours(vertical_mask, horizontal_mask)
|
vertical_mask, vertical_segments = find_lines(
|
||||||
table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
|
self.threshold, direction='vertical', line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations)
|
||||||
|
horizontal_mask, horizontal_segments = find_lines(
|
||||||
|
self.threshold, direction='horizontal', line_scale=self.line_scale,
|
||||||
|
iterations=self.iterations)
|
||||||
|
|
||||||
|
areas = scale_areas(self.table_areas)
|
||||||
|
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
|
||||||
|
|
||||||
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
self.table_bbox_unscaled = copy.deepcopy(table_bbox)
|
||||||
|
|
||||||
|
|
@ -318,8 +339,12 @@ class Lattice(BaseParser):
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
if self.images:
|
||||||
os.path.basename(self.rootname)))
|
warnings.warn('{} is image-based, camelot only works on'
|
||||||
|
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||||
|
else:
|
||||||
|
warnings.warn('No tables found on {}'.format(
|
||||||
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_image()
|
self._generate_image()
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,10 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
table_regions : list, optional (default: None)
|
||||||
|
List of page regions that may contain tables of the form x1,y1,x2,y2
|
||||||
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
in PDF coordinate space.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
|
|
@ -51,9 +55,10 @@ class Stream(BaseParser):
|
||||||
to generate columns.
|
to generate columns.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
|
||||||
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
|
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
|
||||||
column_tol=0, **kwargs):
|
column_tol=0, **kwargs):
|
||||||
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -275,7 +280,18 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
self.textedges = []
|
self.textedges = []
|
||||||
if self.table_areas is not None:
|
if self.table_areas is None:
|
||||||
|
hor_text = self.horizontal_text
|
||||||
|
if self.table_regions is not None:
|
||||||
|
# filter horizontal text
|
||||||
|
hor_text = []
|
||||||
|
for region in self.table_regions:
|
||||||
|
x1, y1, x2, y2 = region
|
||||||
|
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
|
||||||
|
hor_text.extend(region_text)
|
||||||
|
# find tables based on nurminen's detection algorithm
|
||||||
|
table_bbox = self._nurminen_table_detection(hor_text)
|
||||||
|
else:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_areas:
|
for area in self.table_areas:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
|
|
@ -284,9 +300,6 @@ class Stream(BaseParser):
|
||||||
x2 = float(x2)
|
x2 = float(x2)
|
||||||
y2 = float(y2)
|
y2 = float(y2)
|
||||||
table_bbox[(x1, y2, x2, y1)] = None
|
table_bbox[(x1, y2, x2, y1)] = None
|
||||||
else:
|
|
||||||
# find tables based on nurminen's detection algorithm
|
|
||||||
table_bbox = self._nurminen_table_detection(self.horizontal_text)
|
|
||||||
self.table_bbox = table_bbox
|
self.table_bbox = table_bbox
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
|
|
@ -395,8 +408,12 @@ class Stream(BaseParser):
|
||||||
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
logger.info('Processing {}'.format(os.path.basename(self.rootname)))
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
warnings.warn("No tables found on {}".format(
|
if self.images:
|
||||||
os.path.basename(self.rootname)))
|
warnings.warn('{} is image-based, camelot only works on'
|
||||||
|
' text-based pages.'.format(os.path.basename(self.rootname)))
|
||||||
|
else:
|
||||||
|
warnings.warn('No tables found on {}'.format(
|
||||||
|
os.path.basename(self.rootname)))
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
|
||||||
106
camelot/utils.py
106
camelot/utils.py
|
|
@ -1,12 +1,17 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
|
import string
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
@ -15,7 +20,78 @@ from pdfminer.pdfinterp import PDFResourceManager
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
LTTextLineVertical)
|
LTTextLineVertical, LTImage)
|
||||||
|
|
||||||
|
|
||||||
|
PY3 = sys.version_info[0] >= 3
|
||||||
|
if PY3:
|
||||||
|
from urllib.request import urlopen
|
||||||
|
from urllib.parse import urlparse as parse_url
|
||||||
|
from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||||
|
else:
|
||||||
|
from urllib2 import urlopen
|
||||||
|
from urlparse import urlparse as parse_url
|
||||||
|
from urlparse import uses_relative, uses_netloc, uses_params
|
||||||
|
|
||||||
|
|
||||||
|
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||||
|
_VALID_URLS.discard('')
|
||||||
|
|
||||||
|
|
||||||
|
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||||
|
def is_url(url):
|
||||||
|
"""Check to see if a URL has a valid protocol.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str or unicode
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
isurl : bool
|
||||||
|
If url has a valid protocol return True otherwise False.
|
||||||
|
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return parse_url(url).scheme in _VALID_URLS
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def random_string(length):
|
||||||
|
ret = ''
|
||||||
|
while length:
|
||||||
|
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
|
||||||
|
length -= 1
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
def download_url(url):
|
||||||
|
"""Download file from specified URL.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url : str or unicode
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
filepath : str or unicode
|
||||||
|
Temporary filepath.
|
||||||
|
|
||||||
|
"""
|
||||||
|
filename = '{}.pdf'.format(random_string(6))
|
||||||
|
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
|
||||||
|
obj = urlopen(url)
|
||||||
|
if PY3:
|
||||||
|
content_type = obj.info().get_content_type()
|
||||||
|
else:
|
||||||
|
content_type = obj.info().getheader('Content-Type')
|
||||||
|
if content_type != 'application/pdf':
|
||||||
|
raise NotImplementedError("File format not supported")
|
||||||
|
f.write(obj.read())
|
||||||
|
filepath = os.path.join(os.path.dirname(f.name), filename)
|
||||||
|
shutil.move(f.name, filepath)
|
||||||
|
return filepath
|
||||||
|
|
||||||
|
|
||||||
stream_kwargs = [
|
stream_kwargs = [
|
||||||
|
|
@ -25,7 +101,7 @@ stream_kwargs = [
|
||||||
]
|
]
|
||||||
lattice_kwargs = [
|
lattice_kwargs = [
|
||||||
'process_background',
|
'process_background',
|
||||||
'line_size_scaling',
|
'line_scale',
|
||||||
'copy_text',
|
'copy_text',
|
||||||
'shift_text',
|
'shift_text',
|
||||||
'line_tol',
|
'line_tol',
|
||||||
|
|
@ -194,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
|
||||||
return tables_new, v_segments_new, h_segments_new
|
return tables_new, v_segments_new, h_segments_new
|
||||||
|
|
||||||
|
|
||||||
def get_rotation(lttextlh, lttextlv, ltchar):
|
def get_rotation(chars, horizontal_text, vertical_text):
|
||||||
"""Detects if text in table is rotated or not using the current
|
"""Detects if text in table is rotated or not using the current
|
||||||
transformation matrix (CTM) and returns its orientation.
|
transformation matrix (CTM) and returns its orientation.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
lttextlh : list
|
horizontal_text : list
|
||||||
List of PDFMiner LTTextLineHorizontal objects.
|
List of PDFMiner LTTextLineHorizontal objects.
|
||||||
lttextlv : list
|
vertical_text : list
|
||||||
List of PDFMiner LTTextLineVertical objects.
|
List of PDFMiner LTTextLineVertical objects.
|
||||||
ltchar : list
|
chars : list
|
||||||
List of PDFMiner LTChar objects.
|
List of PDFMiner LTChar objects.
|
||||||
|
|
@ -216,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
rotation = ''
|
rotation = ''
|
||||||
hlen = len([t for t in lttextlh if t.get_text().strip()])
|
hlen = len([t for t in horizontal_text if t.get_text().strip()])
|
||||||
vlen = len([t for t in lttextlv if t.get_text().strip()])
|
vlen = len([t for t in vertical_text if t.get_text().strip()])
|
||||||
if hlen < vlen:
|
if hlen < vlen:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
|
||||||
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
|
||||||
return rotation
|
return rotation
|
||||||
|
|
||||||
|
|
@ -263,7 +339,7 @@ def text_in_bbox(bbox, text):
|
||||||
----------
|
----------
|
||||||
bbox : tuple
|
bbox : tuple
|
||||||
Tuple (x1, y1, x2, y2) representing a bounding box where
|
Tuple (x1, y1, x2, y2) representing a bounding box where
|
||||||
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
|
(x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
|
||||||
space.
|
space.
|
||||||
text : List of PDFMiner text objects.
|
text : List of PDFMiner text objects.
|
||||||
|
|
||||||
|
|
@ -637,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
|
||||||
List of PDFMiner text objects.
|
List of PDFMiner text objects.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if ltype == "char":
|
if ltype == 'char':
|
||||||
LTObject = LTChar
|
LTObject = LTChar
|
||||||
elif ltype == "lh":
|
elif ltype == 'image':
|
||||||
|
LTObject = LTImage
|
||||||
|
elif ltype == 'horizontal_text':
|
||||||
LTObject = LTTextLineHorizontal
|
LTObject = LTTextLineHorizontal
|
||||||
elif ltype == "lv":
|
elif ltype == 'vertical_text':
|
||||||
LTObject = LTTextLineVertical
|
LTObject = LTTextLineVertical
|
||||||
if t is None:
|
if t is None:
|
||||||
t = []
|
t = []
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
|
||||||
|
"18-75","1,00%"
|
||||||
|
"76-80","0,50%"
|
||||||
|
"81 in poi","0,10%"
|
||||||
|
Binary file not shown.
|
|
@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
|
||||||
Specify table areas
|
Specify table areas
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
|
In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
|
||||||
|
|
||||||
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
||||||
|
|
||||||
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
|
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
|
||||||
|
|
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:file: ../_static/csv/table_areas.csv
|
:file: ../_static/csv/table_areas.csv
|
||||||
|
|
||||||
|
Specify table regions
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
However there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table might not lie at the exact coordinates every time but in an approximate region.
|
||||||
|
|
||||||
|
You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
|
||||||
|
>>> tables[0].df
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
Here's how you can do the same with the :ref:`command-line interface <cli>`.
|
||||||
|
::
|
||||||
|
|
||||||
|
$ camelot lattice -R 170,370,560,270 table_regions.pdf
|
||||||
|
|
||||||
|
.. csv-table::
|
||||||
|
:file: ../_static/csv/table_regions.csv
|
||||||
|
|
||||||
Specify column separators
|
Specify column separators
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
|
@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
|
||||||
Detect short lines
|
Detect short lines
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
|
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15.
|
||||||
|
|
||||||
As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
|
As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
|
||||||
|
|
||||||
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
|
.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
|
||||||
|
|
||||||
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
|
Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
|
||||||
|
|
||||||
|
|
@ -458,11 +477,11 @@ Let's plot the table for this PDF.
|
||||||
:alt: A plot of the PDF table with short lines
|
:alt: A plot of the PDF table with short lines
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
|
Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
|
||||||
>>> camelot.plot(tables[0], kind='grid')
|
>>> camelot.plot(tables[0], kind='grid')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
|
|
@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. tip::
|
.. tip::
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ Begin by importing the Camelot module::
|
||||||
|
|
||||||
>>> import camelot
|
>>> import camelot
|
||||||
|
|
||||||
Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that, we will set the ``mesh`` keyword argument to ``True``.
|
Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here.
|
||||||
|
|
||||||
.. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
|
.. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ Woah! The accuracy is top-notch and there is less whitespace, which means the ta
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:file: ../_static/csv/foo.csv
|
:file: ../_static/csv/foo.csv
|
||||||
|
|
||||||
Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>` or :meth:`to_html() <camelot.core.Table.to_html>` methods to export the table as JSON, Excel and HTML files respectively.
|
Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>`, :meth:`to_html() <camelot.core.Table.to_html>` or :meth:`to_sqlite() <camelot.core.Table.to_sqlite>` methods to export the table as JSON, Excel, HTML files or a sqlite database respectively.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
|
@ -76,7 +76,7 @@ You can also export all tables at once, using the :class:`tables <camelot.core.T
|
||||||
|
|
||||||
$ camelot --format csv --output foo.csv lattice foo.pdf
|
$ camelot --format csv --output foo.csv lattice foo.pdf
|
||||||
|
|
||||||
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``.
|
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'``, ``f='html'`` or ``f='sqlite'``.
|
||||||
|
|
||||||
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.
|
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
|
||||||
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
|
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
data_lattice_table_regions = [
|
||||||
|
['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'],
|
||||||
|
['18-75', '1,00%'],
|
||||||
|
['76-80', '0,50%'],
|
||||||
|
['81 in poi', '0,10%']
|
||||||
|
]
|
||||||
|
|
||||||
data_lattice_table_areas = [
|
data_lattice_table_areas = [
|
||||||
["", "", "", "", "", "", "", "", ""],
|
["", "", "", "", "", "", "", "", ""],
|
||||||
["State", "n", "Literacy Status", "", "", "", "", "", ""],
|
["State", "n", "Literacy Status", "", "", "", "", "", ""],
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -159,6 +159,14 @@ def test_lattice_two_tables():
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice_table_regions():
|
||||||
|
df = pd.DataFrame(data_lattice_table_regions)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "table_region.pdf")
|
||||||
|
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
|
||||||
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_table_areas():
|
def test_lattice_table_areas():
|
||||||
df = pd.DataFrame(data_lattice_table_areas)
|
df = pd.DataFrame(data_lattice_table_areas)
|
||||||
|
|
||||||
|
|
@ -179,7 +187,7 @@ def test_lattice_copy_text():
|
||||||
df = pd.DataFrame(data_lattice_copy_text)
|
df = pd.DataFrame(data_lattice_copy_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
|
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -189,13 +197,13 @@ def test_lattice_shift_text():
|
||||||
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "column_span_2.pdf")
|
filename = os.path.join(testdir, "column_span_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, line_size_scaling=40)
|
tables = camelot.read_pdf(filename, line_scale=40)
|
||||||
assert df_lt.equals(tables[0].df)
|
assert df_lt.equals(tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
|
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
|
||||||
assert df_disable.equals(tables[0].df)
|
assert df_disable.equals(tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
|
tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
|
||||||
assert df_rb.equals(tables[0].df)
|
assert df_rb.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -207,6 +215,32 @@ def test_repr():
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pages():
|
||||||
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
|
tables = camelot.read_pdf(url)
|
||||||
|
assert repr(tables) == "<TableList n=1>"
|
||||||
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
tables = camelot.read_pdf(url, pages='1-end')
|
||||||
|
assert repr(tables) == "<TableList n=1>"
|
||||||
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
tables = camelot.read_pdf(url, pages='all')
|
||||||
|
assert repr(tables) == "<TableList n=1>"
|
||||||
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_url():
|
||||||
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
|
tables = camelot.read_pdf(url)
|
||||||
|
assert repr(tables) == "<TableList n=1>"
|
||||||
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
def test_arabic():
|
def test_arabic():
|
||||||
df = pd.DataFrame(data_arabic)
|
df = pd.DataFrame(data_arabic)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,15 @@ def test_stream_equal_length():
|
||||||
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_warning():
|
||||||
|
filename = os.path.join(testdir, 'image.pdf')
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter('error')
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found():
|
def test_no_tables_found():
|
||||||
filename = os.path.join(testdir, 'blank.pdf')
|
filename = os.path.join(testdir, 'blank.pdf')
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue