Merge pull request #146 from camelot-dev/add-python38-travis
[MRG] Fix test data and drop python2 support
pull/166/head
15
.travis.yml
|
|
@ -8,14 +8,6 @@ install:
|
||||||
- make install
|
- make install
|
||||||
jobs:
|
jobs:
|
||||||
include:
|
include:
|
||||||
- stage: test
|
|
||||||
script:
|
|
||||||
- make test
|
|
||||||
python: '2.7'
|
|
||||||
- stage: test
|
|
||||||
script:
|
|
||||||
- make test
|
|
||||||
python: '3.5'
|
|
||||||
- stage: test
|
- stage: test
|
||||||
script:
|
script:
|
||||||
- make test
|
- make test
|
||||||
|
|
@ -25,8 +17,13 @@ jobs:
|
||||||
- make test
|
- make test
|
||||||
python: '3.7'
|
python: '3.7'
|
||||||
dist: xenial
|
dist: xenial
|
||||||
|
- stage: test
|
||||||
|
script:
|
||||||
|
- make test
|
||||||
|
python: '3.8'
|
||||||
|
dist: xenial
|
||||||
- stage: coverage
|
- stage: coverage
|
||||||
python: '3.6'
|
python: '3.8'
|
||||||
script:
|
script:
|
||||||
- make test
|
- make test
|
||||||
- codecov --verbose
|
- codecov --verbose
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import absolute_import
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ("main",)
|
__all__ = ("main",)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,9 +8,9 @@ REVISION = None
|
||||||
def generate_version(version, prerelease=None, revision=None):
|
def generate_version(version, prerelease=None, revision=None):
|
||||||
version_parts = [".".join(map(str, version))]
|
version_parts = [".".join(map(str, version))]
|
||||||
if prerelease is not None:
|
if prerelease is not None:
|
||||||
version_parts.append("-{}".format(prerelease))
|
version_parts.append(f"-{prerelease}")
|
||||||
if revision is not None:
|
if revision is not None:
|
||||||
version_parts.append(".{}".format(revision))
|
version_parts.append(f".{revision}")
|
||||||
return "".join(version_parts)
|
return "".join(version_parts)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -204,7 +204,7 @@ def lattice(c, *args, **kwargs):
|
||||||
tables = read_pdf(
|
tables = read_pdf(
|
||||||
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
|
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
|
||||||
)
|
)
|
||||||
click.echo("Found {} tables".format(tables.n))
|
click.echo(f"Found {tables.n} tables")
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
for table in tables:
|
||||||
plot(table, kind=plot_type)
|
plot(table, kind=plot_type)
|
||||||
|
|
@ -295,7 +295,7 @@ def stream(c, *args, **kwargs):
|
||||||
tables = read_pdf(
|
tables = read_pdf(
|
||||||
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
|
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
|
||||||
)
|
)
|
||||||
click.echo("Found {} tables".format(tables.n))
|
click.echo(f"Found {tables.n} tables")
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
for table in tables:
|
||||||
plot(table, kind=plot_type)
|
plot(table, kind=plot_type)
|
||||||
|
|
|
||||||
|
|
@ -52,13 +52,10 @@ class TextEdge(object):
|
||||||
self.is_valid = False
|
self.is_valid = False
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
|
x = round(self.x, 2)
|
||||||
round(self.x, 2),
|
y0 = round(self.y0, 2)
|
||||||
round(self.y0, 2),
|
y1 = round(self.y1, 2)
|
||||||
round(self.y1, 2),
|
return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} valid={self.is_valid}>"
|
||||||
self.align,
|
|
||||||
self.is_valid,
|
|
||||||
)
|
|
||||||
|
|
||||||
def update_coords(self, x, y0, edge_tol=50):
|
def update_coords(self, x, y0, edge_tol=50):
|
||||||
"""Updates the text edge's x and bottom y coordinates and sets
|
"""Updates the text edge's x and bottom y coordinates and sets
|
||||||
|
|
@ -291,9 +288,11 @@ class Cell(object):
|
||||||
self._text = ""
|
self._text = ""
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<Cell x1={} y1={} x2={} y2={}>".format(
|
x1 = round(self.x1, 2)
|
||||||
round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
|
y1 = round(self.y1, 2)
|
||||||
)
|
x2 = round(self.x2, 2)
|
||||||
|
y2 = round(self.y2, 2)
|
||||||
|
return f"<Cell x1={x1} y1={y1} x2={x2} y2={y2}>"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
|
|
@ -351,7 +350,7 @@ class Table(object):
|
||||||
self.page = None
|
self.page = None
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
|
return f"<{self.__class__.__name__} shape={self.shape}>"
|
||||||
|
|
||||||
def __lt__(self, other):
|
def __lt__(self, other):
|
||||||
if self.page == other.page:
|
if self.page == other.page:
|
||||||
|
|
@ -612,7 +611,7 @@ class Table(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
kw = {
|
kw = {
|
||||||
"sheet_name": "page-{}-table-{}".format(self.page, self.order),
|
"sheet_name": f"page-{self.page}-table-{self.order}",
|
||||||
"encoding": "utf-8",
|
"encoding": "utf-8",
|
||||||
}
|
}
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
|
|
@ -649,7 +648,7 @@ class Table(object):
|
||||||
kw = {"if_exists": "replace", "index": False}
|
kw = {"if_exists": "replace", "index": False}
|
||||||
kw.update(kwargs)
|
kw.update(kwargs)
|
||||||
conn = sqlite3.connect(path)
|
conn = sqlite3.connect(path)
|
||||||
table_name = "page-{}-table-{}".format(self.page, self.order)
|
table_name = f"page-{self.page}-table-{self.order}"
|
||||||
self.df.to_sql(table_name, conn, **kw)
|
self.df.to_sql(table_name, conn, **kw)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
@ -670,7 +669,7 @@ class TableList(object):
|
||||||
self._tables = tables
|
self._tables = tables
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<{} n={}>".format(self.__class__.__name__, self.n)
|
return f"<{self.__class__.__name__} n={self.n}>"
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self._tables)
|
return len(self._tables)
|
||||||
|
|
@ -680,7 +679,7 @@ class TableList(object):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_func(table, f):
|
def _format_func(table, f):
|
||||||
return getattr(table, "to_{}".format(f))
|
return getattr(table, f"to_{f}")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def n(self):
|
def n(self):
|
||||||
|
|
@ -691,9 +690,7 @@ class TableList(object):
|
||||||
root = kwargs.get("root")
|
root = kwargs.get("root")
|
||||||
ext = kwargs.get("ext")
|
ext = kwargs.get("ext")
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
filename = os.path.join(
|
filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
|
||||||
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
|
||||||
)
|
|
||||||
filepath = os.path.join(dirname, filename)
|
filepath = os.path.join(dirname, filename)
|
||||||
to_format = self._format_func(table, f)
|
to_format = self._format_func(table, f)
|
||||||
to_format(filepath)
|
to_format(filepath)
|
||||||
|
|
@ -706,9 +703,7 @@ class TableList(object):
|
||||||
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
zipname = os.path.join(os.path.dirname(path), root) + ".zip"
|
||||||
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
filename = os.path.join(
|
filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
|
||||||
"{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
|
|
||||||
)
|
|
||||||
filepath = os.path.join(dirname, filename)
|
filepath = os.path.join(dirname, filename)
|
||||||
z.write(filepath, os.path.basename(filepath))
|
z.write(filepath, os.path.basename(filepath))
|
||||||
|
|
||||||
|
|
@ -741,7 +736,7 @@ class TableList(object):
|
||||||
filepath = os.path.join(dirname, basename)
|
filepath = os.path.join(dirname, basename)
|
||||||
writer = pd.ExcelWriter(filepath)
|
writer = pd.ExcelWriter(filepath)
|
||||||
for table in self._tables:
|
for table in self._tables:
|
||||||
sheet_name = "page-{}-table-{}".format(table.page, table.order)
|
sheet_name = f"page-{table.page}-table-{table.order}"
|
||||||
table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
|
table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
|
||||||
writer.save()
|
writer.save()
|
||||||
if compress:
|
if compress:
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,7 @@ def delete_instance(instance):
|
||||||
"""
|
"""
|
||||||
return libgs.gsapi_delete_instance(instance)
|
return libgs.gsapi_delete_instance(instance)
|
||||||
|
|
||||||
|
|
||||||
if sys.platform == "win32":
|
if sys.platform == "win32":
|
||||||
c_stdstream_call_t = WINFUNCTYPE(c_int, gs_main_instance, POINTER(c_char), c_int)
|
c_stdstream_call_t = WINFUNCTYPE(c_int, gs_main_instance, POINTER(c_char), c_int)
|
||||||
else:
|
else:
|
||||||
|
|
@ -247,7 +248,10 @@ if sys.platform == "win32":
|
||||||
libgs = __win32_finddll()
|
libgs = __win32_finddll()
|
||||||
if not libgs:
|
if not libgs:
|
||||||
import ctypes.util
|
import ctypes.util
|
||||||
libgs = ctypes.util.find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))) # finds in %PATH%
|
|
||||||
|
libgs = ctypes.util.find_library(
|
||||||
|
"".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))
|
||||||
|
) # finds in %PATH%
|
||||||
if not libgs:
|
if not libgs:
|
||||||
raise RuntimeError("Please make sure that Ghostscript is installed")
|
raise RuntimeError("Please make sure that Ghostscript is installed")
|
||||||
libgs = windll.LoadLibrary(libgs)
|
libgs = windll.LoadLibrary(libgs)
|
||||||
|
|
|
||||||
|
|
@ -106,7 +106,7 @@ class PDFHandler(object):
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
fpath = os.path.join(temp, "page-{0}.pdf".format(page))
|
fpath = os.path.join(temp, f"page-{page}.pdf")
|
||||||
froot, fext = os.path.splitext(fpath)
|
froot, fext = os.path.splitext(fpath)
|
||||||
p = infile.getPage(page - 1)
|
p = infile.getPage(page - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
|
|
@ -164,7 +164,7 @@ class PDFHandler(object):
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filepath, p, tempdir)
|
self._save_page(self.filepath, p, tempdir)
|
||||||
pages = [
|
pages = [
|
||||||
os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
|
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
|
||||||
]
|
]
|
||||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||||
for p in pages:
|
for p in pages:
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import copy
|
import copy
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
|
|
@ -358,7 +357,7 @@ class Stream(BaseParser):
|
||||||
ncols = max(set(elements), key=elements.count)
|
ncols = max(set(elements), key=elements.count)
|
||||||
else:
|
else:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"No tables found in table area {}".format(table_idx + 1)
|
f"No tables found in table area {table_idx + 1}"
|
||||||
)
|
)
|
||||||
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
||||||
|
|
@ -433,19 +432,19 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
|
||||||
self._generate_layout(filename, layout_kwargs)
|
self._generate_layout(filename, layout_kwargs)
|
||||||
|
base_filename = os.path.basename(self.rootname)
|
||||||
|
|
||||||
if not suppress_stdout:
|
if not suppress_stdout:
|
||||||
logger.info("Processing {}".format(os.path.basename(self.rootname)))
|
logger.info(f"Processing {base_filename}")
|
||||||
|
|
||||||
if not self.horizontal_text:
|
if not self.horizontal_text:
|
||||||
if self.images:
|
if self.images:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"{} is image-based, camelot only works on"
|
f"{base_filename} is image-based, camelot only works on"
|
||||||
" text-based pages.".format(os.path.basename(self.rootname))
|
" text-based pages."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
warnings.warn(
|
warnings.warn(f"No tables found on {base_filename}")
|
||||||
"No tables found on {}".format(os.path.basename(self.rootname))
|
|
||||||
)
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
self._generate_table_bbox()
|
self._generate_table_bbox()
|
||||||
|
|
|
||||||
|
|
@ -35,11 +35,11 @@ class PlotMethods(object):
|
||||||
|
|
||||||
if table.flavor == "lattice" and kind in ["textedge"]:
|
if table.flavor == "lattice" and kind in ["textedge"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Lattice flavor does not support kind='{}'".format(kind)
|
f"Lattice flavor does not support kind='{kind}'"
|
||||||
)
|
)
|
||||||
elif table.flavor == "stream" and kind in ["joint", "line"]:
|
elif table.flavor == "stream" and kind in ["joint", "line"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Stream flavor does not support kind='{}'".format(kind)
|
f"Stream flavor does not support kind='{kind}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
plot_method = getattr(self, kind)
|
plot_method = getattr(self, kind)
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import division
|
|
||||||
|
|
||||||
import re
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import re
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
import string
|
import string
|
||||||
|
|
@ -29,16 +27,9 @@ from pdfminer.layout import (
|
||||||
LTImage,
|
LTImage,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
PY3 = sys.version_info[0] >= 3
|
from urllib.parse import urlparse as parse_url
|
||||||
if PY3:
|
from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||||
from urllib.request import urlopen
|
|
||||||
from urllib.parse import urlparse as parse_url
|
|
||||||
from urllib.parse import uses_relative, uses_netloc, uses_params
|
|
||||||
else:
|
|
||||||
from urllib2 import urlopen
|
|
||||||
from urlparse import urlparse as parse_url
|
|
||||||
from urlparse import uses_relative, uses_netloc, uses_params
|
|
||||||
|
|
||||||
|
|
||||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||||
|
|
@ -88,13 +79,12 @@ def download_url(url):
|
||||||
Temporary filepath.
|
Temporary filepath.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
filename = "{}.pdf".format(random_string(6))
|
filename = f"{random_string(6)}.pdf"
|
||||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||||
obj = urlopen(url)
|
headers = {"User-Agent": "Mozilla/5.0"}
|
||||||
if PY3:
|
request = Request(url, None, headers)
|
||||||
|
obj = urlopen(request)
|
||||||
content_type = obj.info().get_content_type()
|
content_type = obj.info().get_content_type()
|
||||||
else:
|
|
||||||
content_type = obj.info().getheader("Content-Type")
|
|
||||||
if content_type != "application/pdf":
|
if content_type != "application/pdf":
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
f.write(obj.read())
|
f.write(obj.read())
|
||||||
|
|
@ -123,9 +113,7 @@ def validate_input(kwargs, flavor="lattice"):
|
||||||
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
|
||||||
if isec:
|
if isec:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"{} cannot be used with flavor='{}'".format(
|
f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
|
||||||
",".join(sorted(isec)), flavor
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if flavor == "lattice":
|
if flavor == "lattice":
|
||||||
|
|
@ -423,7 +411,7 @@ def text_strip(text, strip=""):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
stripped = re.sub(
|
stripped = re.sub(
|
||||||
r"[{}]".format("".join(map(re.escape, strip))), "", text, re.UNICODE
|
fr"[{''.join(map(re.escape, strip))}]", "", text, re.UNICODE
|
||||||
)
|
)
|
||||||
return stripped
|
return stripped
|
||||||
|
|
||||||
|
|
@ -660,9 +648,7 @@ def get_table_index(
|
||||||
text_range = (t.x0, t.x1)
|
text_range = (t.x0, t.x1)
|
||||||
col_range = (table.cols[0][0], table.cols[-1][1])
|
col_range = (table.cols[0][0], table.cols[-1][1])
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"{} {} does not lie in column range {}".format(
|
f"{text} {text_range} does not lie in column range {col_range}"
|
||||||
text, text_range, col_range
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
r_idx = r
|
r_idx = r
|
||||||
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
c_idx = lt_col_overlap.index(max(lt_col_overlap))
|
||||||
|
|
|
||||||
4
setup.py
|
|
@ -71,10 +71,10 @@ def setup_package():
|
||||||
# Trove classifiers
|
# Trove classifiers
|
||||||
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
|
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
|
||||||
'License :: OSI Approved :: MIT License',
|
'License :: OSI Approved :: MIT License',
|
||||||
'Programming Language :: Python :: 2.7',
|
|
||||||
'Programming Language :: Python :: 3.5',
|
'Programming Language :: Python :: 3.5',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.6',
|
||||||
'Programming Language :: Python :: 3.7'
|
'Programming Language :: Python :: 3.7',
|
||||||
|
'Programming Language :: Python :: 3.8'
|
||||||
])
|
])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,3 @@
|
||||||
import matplotlib
|
import matplotlib
|
||||||
matplotlib.use('agg')
|
|
||||||
|
matplotlib.use("agg")
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
data_stream = [
|
data_stream = [
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
|
||||||
["", "", "", "", "", "Revenue &", "", ""],
|
["", "", "", "", "", "Revenue &", "", ""],
|
||||||
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
|
||||||
|
|
@ -829,18 +817,6 @@ data_stream_table_rotated = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_two_tables_1 = [
|
data_stream_two_tables_1 = [
|
||||||
[
|
|
||||||
"[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
|
"Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
|
||||||
"",
|
"",
|
||||||
|
|
@ -1300,29 +1276,10 @@ data_stream_two_tables_1 = [
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
],
|
],
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_two_tables_2 = [
|
data_stream_two_tables_2 = [
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
||||||
[
|
[
|
||||||
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
|
"[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
|
||||||
|
|
@ -1600,16 +1557,9 @@ data_stream_two_tables_2 = [
|
||||||
"3,950",
|
"3,950",
|
||||||
],
|
],
|
||||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||||
[
|
|
||||||
"",
|
|
||||||
"Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
],
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_table_areas = [
|
data_stream_table_areas = [
|
||||||
["", "One Withholding"],
|
["", "One Withholding"],
|
||||||
["Payroll Period", "Allowance"],
|
["Payroll Period", "Allowance"],
|
||||||
|
|
@ -1776,18 +1726,7 @@ data_stream_columns = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_split_text = [
|
data_stream_split_text = [
|
||||||
[
|
["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""],
|
||||||
"FEB",
|
|
||||||
"RUAR",
|
|
||||||
"Y 2014 M27 (BUS)",
|
|
||||||
"",
|
|
||||||
"ALPHABETIC LISTING BY T",
|
|
||||||
"YPE",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"ABLPDM27",
|
|
||||||
],
|
|
||||||
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
|
["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
|
||||||
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
|
["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
|
||||||
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
|
["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
|
||||||
|
|
@ -2121,6 +2060,7 @@ data_stream_split_text = [
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
data_stream_flag_size = [
|
data_stream_flag_size = [
|
||||||
[
|
[
|
||||||
"States",
|
"States",
|
||||||
|
|
@ -2820,7 +2760,7 @@ data_arabic = [
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_layout_kwargs = [
|
data_stream_layout_kwargs = [
|
||||||
["V i n s a u Ve r r e", ""],
|
["V i n s a u V e r r e", ""],
|
||||||
["Les Blancs", "12.5CL"],
|
["Les Blancs", "12.5CL"],
|
||||||
["A.O.P Côtes du Rhône", ""],
|
["A.O.P Côtes du Rhône", ""],
|
||||||
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
|
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 48 KiB After Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 6.7 KiB After Width: | Height: | Size: 6.7 KiB |
|
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 14 KiB |
|
Before Width: | Height: | Size: 8.8 KiB After Width: | Height: | Size: 8.9 KiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 19 KiB |
|
|
@ -114,31 +114,35 @@ def test_cli_password():
|
||||||
def test_cli_output_format():
|
def test_cli_output_format():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, "health.pdf")
|
infile = os.path.join(testdir, "health.pdf")
|
||||||
outfile = os.path.join(tempdir, "health.{}")
|
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
|
|
||||||
# json
|
# json
|
||||||
|
outfile = os.path.join(tempdir, "health.json")
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
cli,
|
cli,
|
||||||
["--format", "json", "--output", outfile.format("json"), "stream", infile],
|
["--format", "json", "--output", outfile, "stream", infile],
|
||||||
)
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
|
|
||||||
# excel
|
# excel
|
||||||
|
outfile = os.path.join(tempdir, "health.xlsx")
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
cli,
|
cli,
|
||||||
["--format", "excel", "--output", outfile.format("xlsx"), "stream", infile],
|
["--format", "excel", "--output", outfile, "stream", infile],
|
||||||
)
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
|
|
||||||
# html
|
# html
|
||||||
|
outfile = os.path.join(tempdir, "health.html")
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
cli,
|
cli,
|
||||||
["--format", "html", "--output", outfile.format("html"), "stream", infile],
|
["--format", "html", "--output", outfile, "stream", infile],
|
||||||
)
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
|
|
||||||
# zip
|
# zip
|
||||||
|
outfile = os.path.join(tempdir, "health.csv")
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
cli,
|
cli,
|
||||||
[
|
[
|
||||||
|
|
@ -146,7 +150,7 @@ def test_cli_output_format():
|
||||||
"--format",
|
"--format",
|
||||||
"csv",
|
"csv",
|
||||||
"--output",
|
"--output",
|
||||||
outfile.format("csv"),
|
outfile,
|
||||||
"stream",
|
"stream",
|
||||||
infile,
|
infile,
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -10,88 +10,93 @@ import camelot
|
||||||
|
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
filename = os.path.join(testdir, 'foo.pdf')
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
|
|
||||||
|
|
||||||
def test_unknown_flavor():
|
def test_unknown_flavor():
|
||||||
message = ("Unknown flavor specified."
|
message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||||
" Use either 'lattice' or 'stream'")
|
|
||||||
with pytest.raises(NotImplementedError, match=message):
|
with pytest.raises(NotImplementedError, match=message):
|
||||||
tables = camelot.read_pdf(filename, flavor='chocolate')
|
tables = camelot.read_pdf(filename, flavor="chocolate")
|
||||||
|
|
||||||
|
|
||||||
def test_input_kwargs():
|
def test_input_kwargs():
|
||||||
message = "columns cannot be used with flavor='lattice'"
|
message = "columns cannot be used with flavor='lattice'"
|
||||||
with pytest.raises(ValueError, match=message):
|
with pytest.raises(ValueError, match=message):
|
||||||
tables = camelot.read_pdf(filename, columns=['10,20,30,40'])
|
tables = camelot.read_pdf(filename, columns=["10,20,30,40"])
|
||||||
|
|
||||||
|
|
||||||
def test_unsupported_format():
|
def test_unsupported_format():
|
||||||
message = 'File format not supported'
|
message = "File format not supported"
|
||||||
filename = os.path.join(testdir, 'foo.csv')
|
filename = os.path.join(testdir, "foo.csv")
|
||||||
with pytest.raises(NotImplementedError, match=message):
|
with pytest.raises(NotImplementedError, match=message):
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_equal_length():
|
def test_stream_equal_length():
|
||||||
message = ("Length of table_areas and columns"
|
message = "Length of table_areas and columns" " should be equal"
|
||||||
" should be equal")
|
|
||||||
with pytest.raises(ValueError, match=message):
|
with pytest.raises(ValueError, match=message):
|
||||||
tables = camelot.read_pdf(filename, flavor='stream',
|
tables = camelot.read_pdf(
|
||||||
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
filename,
|
||||||
|
flavor="stream",
|
||||||
|
table_areas=["10,20,30,40"],
|
||||||
|
columns=["10,20,30,40", "10,20,30,40"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_image_warning():
|
def test_image_warning():
|
||||||
filename = os.path.join(testdir, 'image.pdf')
|
filename = os.path.join(testdir, "image.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter('error')
|
warnings.simplefilter("error")
|
||||||
with pytest.raises(UserWarning) as e:
|
with pytest.raises(UserWarning) as e:
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
|
assert (
|
||||||
|
str(e.value)
|
||||||
|
== "page-1 is image-based, camelot only works on text-based pages."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found():
|
def test_no_tables_found():
|
||||||
filename = os.path.join(testdir, 'blank.pdf')
|
filename = os.path.join(testdir, "blank.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter('error')
|
warnings.simplefilter("error")
|
||||||
with pytest.raises(UserWarning) as e:
|
with pytest.raises(UserWarning) as e:
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
assert str(e.value) == 'No tables found on page-1'
|
assert str(e.value) == "No tables found on page-1"
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found_logs_suppressed():
|
def test_no_tables_found_logs_suppressed():
|
||||||
filename = os.path.join(testdir, 'foo.pdf')
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
# the test should fail if any warning is thrown
|
# the test should fail if any warning is thrown
|
||||||
warnings.simplefilter('error')
|
warnings.simplefilter("error")
|
||||||
try:
|
try:
|
||||||
tables = camelot.read_pdf(filename, suppress_stdout=True)
|
tables = camelot.read_pdf(filename, suppress_stdout=True)
|
||||||
except Warning as e:
|
except Warning as e:
|
||||||
warning_text = str(e)
|
warning_text = str(e)
|
||||||
pytest.fail('Unexpected warning: {}'.format(warning_text))
|
pytest.fail(f"Unexpected warning: {warning_text}")
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found_warnings_suppressed():
|
def test_no_tables_found_warnings_suppressed():
|
||||||
filename = os.path.join(testdir, 'blank.pdf')
|
filename = os.path.join(testdir, "blank.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
# the test should fail if any warning is thrown
|
# the test should fail if any warning is thrown
|
||||||
warnings.simplefilter('error')
|
warnings.simplefilter("error")
|
||||||
try:
|
try:
|
||||||
tables = camelot.read_pdf(filename, suppress_stdout=True)
|
tables = camelot.read_pdf(filename, suppress_stdout=True)
|
||||||
except Warning as e:
|
except Warning as e:
|
||||||
warning_text = str(e)
|
warning_text = str(e)
|
||||||
pytest.fail('Unexpected warning: {}'.format(warning_text))
|
pytest.fail(f"Unexpected warning: {warning_text}")
|
||||||
|
|
||||||
|
|
||||||
def test_no_password():
|
def test_no_password():
|
||||||
filename = os.path.join(testdir, 'health_protected.pdf')
|
filename = os.path.join(testdir, "health_protected.pdf")
|
||||||
message = 'file has not been decrypted'
|
message = "file has not been decrypted"
|
||||||
with pytest.raises(Exception, match=message):
|
with pytest.raises(Exception, match=message):
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_bad_password():
|
def test_bad_password():
|
||||||
filename = os.path.join(testdir, 'health_protected.pdf')
|
filename = os.path.join(testdir, "health_protected.pdf")
|
||||||
message = 'file has not been decrypted'
|
message = "file has not been decrypted"
|
||||||
with pytest.raises(Exception, match=message):
|
with pytest.raises(Exception, match=message):
|
||||||
tables = camelot.read_pdf(filename, password='wrongpass')
|
tables = camelot.read_pdf(filename, password="wrongpass")
|
||||||
|
|
|
||||||
|
|
@ -11,57 +11,50 @@ testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_text_plot():
|
def test_text_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='text')
|
return camelot.plot(tables[0], kind="text")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_grid_plot():
|
def test_grid_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='grid')
|
return camelot.plot(tables[0], kind="grid")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_lattice_contour_plot():
|
def test_lattice_contour_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='contour')
|
return camelot.plot(tables[0], kind="contour")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_stream_contour_plot():
|
def test_stream_contour_plot():
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor='stream')
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
return camelot.plot(tables[0], kind='contour')
|
return camelot.plot(tables[0], kind="contour")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_line_plot():
|
def test_line_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='line')
|
return camelot.plot(tables[0], kind="line")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_joint_plot():
|
def test_joint_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], kind='joint')
|
return camelot.plot(tables[0], kind="joint")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
|
||||||
def test_textedge_plot():
|
def test_textedge_plot():
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor='stream')
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
return camelot.plot(tables[0], kind='textedge')
|
return camelot.plot(tables[0], kind="textedge")
|
||||||
|
|
|
||||||