diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..82ea413 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +open_collective: camelot diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..d174194 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,24 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.8 + install: + - requirements: requirements.txt diff --git a/.travis.yml b/.travis.yml index c603fd5..d791413 100755 --- a/.travis.yml +++ b/.travis.yml @@ -7,10 +7,6 @@ install: - make install jobs: include: - - stage: test - script: - - make test - python: '3.5' - stage: test script: - make test @@ -20,8 +16,13 @@ jobs: - make test python: '3.7' dist: xenial + - stage: test + script: + - make test + python: '3.8' + dist: xenial - stage: coverage - python: '3.6' + python: '3.8' script: - make test - codecov --verbose diff --git a/HISTORY.md b/HISTORY.md index 343463b..879b77b 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,18 @@ Release History master ------ +0.8.0 (2020-05-24) +------------------ + +**Improvements** + +* Drop Python 2 support! + * Remove Python 2.7 and 3.5 support. + * Replace all instances of `.format` with f-strings. + * Remove all `__future__` imports. + * Fix HTTP 403 forbidden exception in read_pdf(url) and remove Python 2 urllib support. + * Fix test data. + **Bugfixes** * Fix library discovery on Windows. [#32](https://github.com/camelot-dev/camelot/pull/32) by [KOLANICH](https://github.com/KOLANICH). diff --git a/camelot/__main__.py b/camelot/__main__.py index 93040c6..a0b82a6 100755 --- a/camelot/__main__.py +++ b/camelot/__main__.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import - __all__ = ("main",) diff --git a/camelot/__version__.py b/camelot/__version__.py index f4e1005..1eb9ce8 100644 --- a/camelot/__version__.py +++ b/camelot/__version__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 3) +VERSION = (0, 8, 0) PRERELEASE = None # alpha, beta or rc REVISION = None @@ -8,9 +8,9 @@ REVISION = None def generate_version(version, prerelease=None, revision=None): version_parts = [".".join(map(str, version))] if prerelease is not None: - version_parts.append("-{}".format(prerelease)) + version_parts.append(f"-{prerelease}") if revision is not None: - version_parts.append(".{}".format(revision)) + version_parts.append(f".{revision}") return "".join(version_parts) diff --git a/camelot/cli.py b/camelot/cli.py index 86f5db8..e3ac86c 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -210,7 +210,7 @@ def lattice(c, *args, **kwargs): filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs ) - click.echo("Found {} tables".format(tables.n)) + click.echo(f"Found {tables.n} tables") if plot_type is not None: for table in tables: plot(table, kind=plot_type) @@ -304,7 +304,7 @@ def stream(c, *args, **kwargs): tables = read_pdf( filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs ) - click.echo("Found {} tables".format(tables.n)) + click.echo(f"Found {tables.n} tables") if plot_type is not None: for table in tables: plot(table, kind=plot_type) @@ -399,7 +399,7 @@ def network(c, *args, **kwargs): filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs ) - click.echo("Found {} tables".format(tables.n)) + click.echo(f"Found {tables.n} tables") if plot_type is not None: for table in tables: plot(table, kind=plot_type) diff --git a/camelot/core.py b/camelot/core.py index 00591c3..2916531 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -68,12 +68,8 @@ class TextAlignment(): def __repr__(self): text_inside = " | ".join( map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "") - return "".format( - coord=self.coord, - tl_count=len(self.textlines), - text_inside=text_inside - ) + return f"" def register_aligned_textline(self, textline, coord): """Update new textline to this alignment, adapting its average.""" @@ -116,13 +112,10 @@ class TextEdge(TextAlignment): self.is_valid = False def __repr__(self): - return "".format( - round(self.coord, 2), - round(self.y0, 2), - round(self.y1, 2), - self.align, - self.is_valid, - ) + x = round(self.x, 2) + y0 = round(self.y0, 2) + y1 = round(self.y1, 2) + return f"" def update_coords(self, x, textline, edge_tol=50): """Updates the text edge's x and bottom y coordinates and sets @@ -386,12 +379,11 @@ class Cell(): self._text = "" def __repr__(self): - return "".format( - round(self.x1, 2), - round(self.y1, 2), - round(self.x2, 2), - round(self.y2, 2) - ) + x1 = round(self.x1, 2) + y1 = round(self.y1, 2) + x2 = round(self.x2, 2) + y2 = round(self.y2, 2) + return f"" @property def text(self): @@ -465,7 +457,7 @@ class Table(): self.textlines = [] # List of actual textlines on the page def __repr__(self): - return "<{} shape={}>".format(self.__class__.__name__, self.shape) + return f"<{self.__class__.__name__} shape={self.shape}>" def __lt__(self, other): if self.page == other.page: @@ -739,7 +731,7 @@ class Table(): """ kw = { - "sheet_name": "page-{}-table-{}".format(self.page, self.order), + "sheet_name": f"page-{self.page}-table-{self.order}", "encoding": "utf-8", } kw.update(kwargs) @@ -777,7 +769,7 @@ class Table(): kw = {"if_exists": "replace", "index": False} kw.update(kwargs) conn = sqlite3.connect(path) - table_name = "page-{}-table-{}".format(self.page, self.order) + table_name = f"page-{self.page}-table-{self.order}" self.df.to_sql(table_name, conn, **kw) conn.commit() conn.close() @@ -831,7 +823,7 @@ class TableList(): self._tables = tables def __repr__(self): - return "<{} n={}>".format(self.__class__.__name__, self.n) + return f"<{self.__class__.__name__} n={self.n}>" def __len__(self): return len(self._tables) @@ -841,7 +833,7 @@ class TableList(): @staticmethod def _format_func(table, f): - return getattr(table, "to_{}".format(f)) + return getattr(table, f"to_{f}") @property def n(self): @@ -852,10 +844,7 @@ class TableList(): root = kwargs.get("root") ext = kwargs.get("ext") for table in self._tables: - filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, table.page, table.order, - ext) - ) + filename = f"{root}-page-{table.page}-table-{table.order}{ext}" filepath = os.path.join(dirname, filename) to_format = self._format_func(table, f) to_format(filepath) @@ -868,12 +857,7 @@ class TableList(): zipname = os.path.join(os.path.dirname(path), root) + ".zip" with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: for table in self._tables: - filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, - table.page, - table.order, - ext) - ) + filename = f"{root}-page-{table.page}-table-{table.order}{ext}" filepath = os.path.join(dirname, filename) z.write(filepath, os.path.basename(filepath)) @@ -907,9 +891,8 @@ class TableList(): # pylint: disable=abstract-class-instantiated writer = pd.ExcelWriter(filepath) for table in self._tables: - sheet_name = "page-{}-table-{}".format(table.page, table.order) - table.df.to_excel(writer, sheet_name=sheet_name, - encoding="utf-8") + sheet_name = f"page-{table.page}-table-{table.order}" + table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8") writer.save() if compress: zipname = os.path.join(os.path.dirname(path), root) + ".zip" diff --git a/camelot/ext/ghostscript/_gsprint.py b/camelot/ext/ghostscript/_gsprint.py index b31b768..9896805 100644 --- a/camelot/ext/ghostscript/_gsprint.py +++ b/camelot/ext/ghostscript/_gsprint.py @@ -81,6 +81,7 @@ def delete_instance(instance): """ return libgs.gsapi_delete_instance(instance) + if sys.platform == "win32": c_stdstream_call_t = WINFUNCTYPE(c_int, gs_main_instance, POINTER(c_char), c_int) else: @@ -247,7 +248,10 @@ if sys.platform == "win32": libgs = __win32_finddll() if not libgs: import ctypes.util - libgs = ctypes.util.find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))) # finds in %PATH% + + libgs = ctypes.util.find_library( + "".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll")) + ) # finds in %PATH% if not libgs: raise RuntimeError("Please make sure that Ghostscript is installed") libgs = windll.LoadLibrary(libgs) diff --git a/camelot/handlers.py b/camelot/handlers.py index b07bea3..c2afc9e 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -133,8 +133,7 @@ class PDFHandler(): infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: infile.decrypt(self.password) - fpath = build_file_path_in_temp_dir( - "page-{page}.pdf".format(page=page)) + fpath = build_file_path_in_temp_dir(f"page-{page}.pdf") froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) outfile = PdfFileWriter() @@ -211,8 +210,7 @@ class PDFHandler(): page_idx, layout_kwargs) if not suppress_stdout: rootname = os.path.basename(parser.rootname) - logger.info( - "Processing {rootname}".format(rootname=rootname)) + logger.info(f"Processing {rootname}") t = parser.extract_tables() tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 9f23430..43e7c65 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import division - import cv2 import numpy as np diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 1e6ed30..c12ed35 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import division import os from .base import BaseParser diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 1f8d8c6..ae54c7b 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import division - import warnings from .base import TextBaseParser @@ -167,8 +165,7 @@ class Stream(TextBaseParser): ncols = max(set(elements), key=elements.count) else: warnings.warn( - "No tables found in table area {bbox}".format( - bbox=bbox) + f"No tables found in table area {bbox}" ) cols = [ (t.x0, t.x1) diff --git a/camelot/plotting.py b/camelot/plotting.py index 26aec3e..a0bf5cd 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -191,14 +191,11 @@ class PlotMethods(): if table.flavor == "lattice" and kind in ["textedge"]: raise NotImplementedError( - "Lattice flavor does not support kind='{}'".format(kind) + f"Lattice flavor does not support kind='{kind}'" ) if table.flavor != "lattice" and kind in ["line"]: raise NotImplementedError( - "{flavor} flavor does not support kind='{kind}'".format( - flavor=table.flavor, - kind=kind - ) + f"{table.flavor} flavor does not support kind='{kind}'" ) plot_method = getattr(self, kind) diff --git a/camelot/utils.py b/camelot/utils.py index d91d556..f9c2b96 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- -from __future__ import division -import re import os import atexit import sys +import re import random import shutil import string @@ -34,18 +33,9 @@ from pdfminer.layout import ( from .ext.ghostscript import Ghostscript -# pylint: disable=import-error -# PyLint will evaluate both branches, and will necessarily complain about one -# of them. -PY3 = sys.version_info[0] >= 3 -if PY3: - from urllib.request import urlopen - from urllib.parse import urlparse as parse_url - from urllib.parse import uses_relative, uses_netloc, uses_params -else: - from urllib2 import urlopen - from urlparse import urlparse as parse_url - from urlparse import uses_relative, uses_netloc, uses_params +from urllib.request import Request, urlopen +from urllib.parse import urlparse as parse_url +from urllib.parse import uses_relative, uses_netloc, uses_params _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -95,14 +85,12 @@ def download_url(url): Temporary filepath. """ - filename = "{}.pdf".format(random_string(6)) + filename = f"{random_string(6)}.pdf" with tempfile.NamedTemporaryFile("wb", delete=False) as f: - req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) - obj = urlopen(req) - if PY3: - content_type = obj.info().get_content_type() - else: - content_type = obj.info().getheader("Content-Type") + headers = {"User-Agent": "Mozilla/5.0"} + request = Request(url, None, headers) + obj = urlopen(request) + content_type = obj.info().get_content_type() if content_type != "application/pdf": raise NotImplementedError("File format not supported") f.write(obj.read()) @@ -110,6 +98,7 @@ def download_url(url): shutil.move(f.name, filepath) return filepath + common_kwargs = [ "flag_size", "margins", @@ -124,7 +113,7 @@ text_kwargs = common_kwargs + [ "row_tol", "column_tol" ] -lattice_kwargs = common_kwargs+ [ +lattice_kwargs = common_kwargs + [ "process_background", "line_scale", "copy_text", @@ -150,8 +139,7 @@ def validate_input(kwargs, flavor="lattice"): isec = set(kwargs.keys()).difference(set(parser_kwargs)) if isec: raise ValueError( - "{} cannot be used with flavor='{}'".format( - ",".join(sorted(isec)), flavor + f"{",".join(sorted(isec))} cannot be used with flavor='{flavor}'" ) ) @@ -763,7 +751,7 @@ def text_strip(text, strip=""): return text stripped = re.sub( - r"[{}]".format("".join(map(re.escape, strip))), "", text, re.UNICODE + fr"[{''.join(map(re.escape, strip))}]", "", text, re.UNICODE ) return stripped @@ -998,9 +986,7 @@ def get_table_index( text_range = (t.x0, t.x1) col_range = (table.cols[0][0], table.cols[-1][1]) warnings.warn( - "{} {} does not lie in column range {}".format( - text, text_range, col_range - ) + f"{text} {text_range} does not lie in column range {col_range}" ) r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) diff --git a/requirements.txt b/requirements.txt index 764c037..2d06f47 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +chardet>=3.0.4 click>=6.7 matplotlib>=2.2.3 numpy>=1.13.3 @@ -6,3 +7,4 @@ openpyxl>=2.5.8 pandas>=0.23.4 pdfminer.six>=20200402 PyPDF2>=1.26.0 +Sphinx>=1.7.9 diff --git a/setup.py b/setup.py index e868232..c7d8428 100644 --- a/setup.py +++ b/setup.py @@ -71,9 +71,9 @@ def setup_package(): # Trove classifiers # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers # noqa 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7' + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8' ]) try: diff --git a/tests/__init__.py b/tests/__init__.py index a946ff7..96c475e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1,3 @@ import matplotlib -matplotlib.use('agg') + +matplotlib.use("agg") diff --git a/tests/data.py b/tests/data.py index 844fc70..1afb4be 100755 --- a/tests/data.py +++ b/tests/data.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - data_stream = [ ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"], diff --git a/tests/test_cli.py b/tests/test_cli.py index fd57eb3..3bb703e 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -140,10 +140,11 @@ def test_cli_password(): def test_cli_output_format(): with TemporaryDirectory() as tempdir: infile = os.path.join(testdir, "health.pdf") - outfile = os.path.join(tempdir, "health.{}") + runner = CliRunner() # json + outfile = os.path.join(tempdir, "health.json") result = runner.invoke( cli, ["--format", "json", "--output", outfile.format("json"), "stream", @@ -152,6 +153,7 @@ def test_cli_output_format(): assert result.exit_code == 0 # excel + outfile = os.path.join(tempdir, "health.xlsx") result = runner.invoke( cli, ["--format", "excel", "--output", outfile.format("xlsx"), "stream", @@ -160,6 +162,7 @@ def test_cli_output_format(): assert result.exit_code == 0 # html + outfile = os.path.join(tempdir, "health.html") result = runner.invoke( cli, ["--format", "html", "--output", outfile.format("html"), "stream", @@ -168,6 +171,7 @@ def test_cli_output_format(): assert result.exit_code == 0 # zip + outfile = os.path.join(tempdir, "health.csv") result = runner.invoke( cli, [ @@ -175,7 +179,7 @@ def test_cli_output_format(): "--format", "csv", "--output", - outfile.format("csv"), + outfile, "stream", infile, ], diff --git a/tests/test_errors.py b/tests/test_errors.py index 835dc46..d74a791 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -10,7 +10,7 @@ import camelot testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") -filename = os.path.join(testdir, 'foo.pdf') +filename = os.path.join(testdir, "foo.pdf") def test_unknown_flavor(): @@ -27,15 +27,14 @@ def test_input_kwargs(): def test_unsupported_format(): - message = 'File format not supported' - filename = os.path.join(testdir, 'foo.csv') + message = "File format not supported" + filename = os.path.join(testdir, "foo.csv") with pytest.raises(NotImplementedError, match=message): camelot.read_pdf(filename) def test_stream_equal_length(): - message = ("Length of table_areas and columns" - " should be equal") + message = "Length of table_areas and columns" " should be equal" with pytest.raises(ValueError, match=message): camelot.read_pdf( filename, @@ -46,9 +45,9 @@ def test_stream_equal_length(): def test_image_warning(): - filename = os.path.join(testdir, 'image.pdf') + filename = os.path.join(testdir, "image.pdf") with warnings.catch_warnings(): - warnings.simplefilter('error') + warnings.simplefilter("error") with pytest.raises(UserWarning) as e: camelot.read_pdf(filename) assert str(e.value) == 'page-1 is image-based, camelot only works ' \ @@ -56,47 +55,47 @@ def test_image_warning(): def test_no_tables_found(): - filename = os.path.join(testdir, 'blank.pdf') + filename = os.path.join(testdir, "blank.pdf") with warnings.catch_warnings(): - warnings.simplefilter('error') + warnings.simplefilter("error") with pytest.raises(UserWarning) as e: camelot.read_pdf(filename) assert str(e.value) == 'No tables found on page-1' def test_no_tables_found_logs_suppressed(): - filename = os.path.join(testdir, 'foo.pdf') + filename = os.path.join(testdir, "foo.pdf") with warnings.catch_warnings(): # the test should fail if any warning is thrown - warnings.simplefilter('error') + warnings.simplefilter("error") try: camelot.read_pdf(filename, suppress_stdout=True) except Warning as e: warning_text = str(e) - pytest.fail('Unexpected warning: {}'.format(warning_text)) + pytest.fail(f"Unexpected warning: {warning_text}") def test_no_tables_found_warnings_suppressed(): - filename = os.path.join(testdir, 'blank.pdf') + filename = os.path.join(testdir, "blank.pdf") with warnings.catch_warnings(): # the test should fail if any warning is thrown - warnings.simplefilter('error') + warnings.simplefilter("error") try: camelot.read_pdf(filename, suppress_stdout=True) except Warning as e: warning_text = str(e) - pytest.fail('Unexpected warning: {}'.format(warning_text)) + pytest.fail(f"Unexpected warning: {warning_text}") def test_no_password(): - filename = os.path.join(testdir, 'health_protected.pdf') - message = 'file has not been decrypted' + filename = os.path.join(testdir, "health_protected.pdf") + message = "file has not been decrypted" with pytest.raises(Exception, match=message): camelot.read_pdf(filename) def test_bad_password(): - filename = os.path.join(testdir, 'health_protected.pdf') - message = 'file has not been decrypted' + filename = os.path.join(testdir, "health_protected.pdf") + message = "file has not been decrypted" with pytest.raises(Exception, match=message): camelot.read_pdf(filename, password='wrongpass')