Merge branch 'master' into hybrid-parser

2020-06-14 08:53:43 -07:00 · 2020-06-14 08:53:43 -07:00 · b43aca8ff5
parent 4fb1e93efd 5efbcdcebb
commit b43aca8ff5
21 changed files with 123 additions and 121 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -0,0 +1 @@
+open_collective: camelot
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@ -0,0 +1,24 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# Build documentation with MkDocs
+#mkdocs:
+#  configuration: mkdocs.yml
+
+# Optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+  version: 3.8
+  install:
+    - requirements: requirements.txt
--- a/.travis.yml
+++ b/.travis.yml
@ -7,10 +7,6 @@ install:
  - make install
 jobs:
  include:
-    - stage: test
-      script:
-        - make test
-      python: '3.5'
    - stage: test
      script:
        - make test
@ -20,8 +16,13 @@ jobs:
        - make test
      python: '3.7'
      dist: xenial
+    - stage: test
+      script:
+        - make test
+      python: '3.8'
+      dist: xenial
    - stage: coverage
-      python: '3.6'
+      python: '3.8'
      script:
        - make test
        - codecov --verbose
--- a/HISTORY.md
+++ b/HISTORY.md
@ -4,6 +4,18 @@ Release History
 master
 ------

+0.8.0 (2020-05-24)
+------------------
+
+**Improvements**
+
+* Drop Python 2 support!
+    * Remove Python 2.7 and 3.5 support.
+    * Replace all instances of `.format` with f-strings.
+    * Remove all `__future__` imports.
+    * Fix HTTP 403 forbidden exception in read_pdf(url) and remove Python 2 urllib support.
+    * Fix test data.
+
 **Bugfixes**

 * Fix library discovery on Windows. [#32](https://github.com/camelot-dev/camelot/pull/32) by [KOLANICH](https://github.com/KOLANICH).
--- a/camelot/main.py
+++ b/camelot/main.py
@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-from __future__ import absolute_import
-

 __all__ = ("main",)

--- a/camelot/version.py
+++ b/camelot/version.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-VERSION = (0, 7, 3)
+VERSION = (0, 8, 0)
 PRERELEASE = None  # alpha, beta or rc
 REVISION = None

@ -8,9 +8,9 @@ REVISION = None
 def generate_version(version, prerelease=None, revision=None):
    version_parts = [".".join(map(str, version))]
    if prerelease is not None:
-        version_parts.append("-{}".format(prerelease))
+        version_parts.append(f"-{prerelease}")
    if revision is not None:
-        version_parts.append(".{}".format(revision))
+        version_parts.append(f".{revision}")
    return "".join(version_parts)


--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -210,7 +210,7 @@ def lattice(c, *args, **kwargs):
        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
        **kwargs
    )
-    click.echo("Found {} tables".format(tables.n))
+    click.echo(f"Found {tables.n} tables")
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)
@ -304,7 +304,7 @@ def stream(c, *args, **kwargs):
    tables = read_pdf(
        filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
    )
-    click.echo("Found {} tables".format(tables.n))
+    click.echo(f"Found {tables.n} tables")
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)
@ -399,7 +399,7 @@ def network(c, *args, **kwargs):
        filepath, pages=pages, flavor="network",
        suppress_stdout=quiet, **kwargs
    )
-    click.echo("Found {} tables".format(tables.n))
+    click.echo(f"Found {tables.n} tables")
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)
--- a/camelot/core.py
+++ b/camelot/core.py
@ -68,12 +68,8 @@ class TextAlignment():
    def __repr__(self):
        text_inside = " | ".join(
            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
-        return "<TextEdge coord={coord} tl={tl_count} " \
-               "textlines text='{text_inside}...'>".format(
-                   coord=self.coord,
-                   tl_count=len(self.textlines),
-                   text_inside=text_inside
-               )
+        return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
+               f"textlines text='{text_inside}...'>"

    def register_aligned_textline(self, textline, coord):
        """Update new textline to this alignment, adapting its average."""
@ -116,13 +112,10 @@ class TextEdge(TextAlignment):
        self.is_valid = False

    def __repr__(self):
-        return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
-            round(self.coord, 2),
-            round(self.y0, 2),
-            round(self.y1, 2),
-            self.align,
-            self.is_valid,
-        )
+        x = round(self.coord, 2)
+        y0 = round(self.y0, 2)
+        y1 = round(self.y1, 2)
+        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} valid={self.is_valid}>"

    def update_coords(self, x, textline, edge_tol=50):
        """Updates the text edge's x and bottom y coordinates and sets
@ -386,12 +379,11 @@ class Cell():
        self._text = ""

    def __repr__(self):
-        return "<Cell x1={} y1={} x2={} y2={}>".format(
-            round(self.x1, 2),
-            round(self.y1, 2),
-            round(self.x2, 2),
-            round(self.y2, 2)
-        )
+        x1 = round(self.x1, 2)
+        y1 = round(self.y1, 2)
+        x2 = round(self.x2, 2)
+        y2 = round(self.y2, 2)
+        return f"<Cell x1={x1} y1={y1} x2={x2} y2={y2}>"

    @property
    def text(self):
@ -465,7 +457,7 @@ class Table():
        self.textlines = []  # List of actual textlines on the page

    def __repr__(self):
-        return "<{} shape={}>".format(self.__class__.__name__, self.shape)
+        return f"<{self.__class__.__name__} shape={self.shape}>"

    def __lt__(self, other):
        if self.page == other.page:
@ -739,7 +731,7 @@ class Table():

        """
        kw = {
-            "sheet_name": "page-{}-table-{}".format(self.page, self.order),
+            "sheet_name": f"page-{self.page}-table-{self.order}",
            "encoding": "utf-8",
        }
        kw.update(kwargs)
@ -777,7 +769,7 @@ class Table():
        kw = {"if_exists": "replace", "index": False}
        kw.update(kwargs)
        conn = sqlite3.connect(path)
-        table_name = "page-{}-table-{}".format(self.page, self.order)
+        table_name = f"page-{self.page}-table-{self.order}"
        self.df.to_sql(table_name, conn, **kw)
        conn.commit()
        conn.close()
@ -831,7 +823,7 @@ class TableList():
        self._tables = tables

    def __repr__(self):
-        return "<{} n={}>".format(self.__class__.__name__, self.n)
+        return f"<{self.__class__.__name__} n={self.n}>"

    def __len__(self):
        return len(self._tables)
@ -841,7 +833,7 @@ class TableList():

    @staticmethod
    def _format_func(table, f):
-        return getattr(table, "to_{}".format(f))
+        return getattr(table, f"to_{f}")

    @property
    def n(self):
@ -852,10 +844,7 @@ class TableList():
        root = kwargs.get("root")
        ext = kwargs.get("ext")
        for table in self._tables:
-            filename = os.path.join(
-                "{}-page-{}-table-{}{}".format(root, table.page, table.order,
-                                               ext)
-            )
+            filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
            filepath = os.path.join(dirname, filename)
            to_format = self._format_func(table, f)
            to_format(filepath)
@ -868,12 +857,7 @@ class TableList():
        zipname = os.path.join(os.path.dirname(path), root) + ".zip"
        with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
            for table in self._tables:
-                filename = os.path.join(
-                    "{}-page-{}-table-{}{}".format(root,
-                                                   table.page,
-                                                   table.order,
-                                                   ext)
-                )
+                filename = f"{root}-page-{table.page}-table-{table.order}{ext}"
                filepath = os.path.join(dirname, filename)
                z.write(filepath, os.path.basename(filepath))

@ -907,9 +891,8 @@ class TableList():
            # pylint: disable=abstract-class-instantiated
            writer = pd.ExcelWriter(filepath)
            for table in self._tables:
-                sheet_name = "page-{}-table-{}".format(table.page, table.order)
-                table.df.to_excel(writer, sheet_name=sheet_name,
-                                  encoding="utf-8")
+                sheet_name = f"page-{table.page}-table-{table.order}"
+                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
            writer.save()
            if compress:
                zipname = os.path.join(os.path.dirname(path), root) + ".zip"
--- a/camelot/ext/ghostscript/_gsprint.py
+++ b/camelot/ext/ghostscript/_gsprint.py
@ -81,6 +81,7 @@ def delete_instance(instance):
    """
    return libgs.gsapi_delete_instance(instance)

+
 if sys.platform == "win32":
    c_stdstream_call_t = WINFUNCTYPE(c_int, gs_main_instance, POINTER(c_char), c_int)
 else:
@ -247,7 +248,10 @@ if sys.platform == "win32":
    libgs = __win32_finddll()
    if not libgs:
        import ctypes.util
-        libgs = ctypes.util.find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))) # finds in %PATH%
+
+        libgs = ctypes.util.find_library(
+            "".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))
+        )  # finds in %PATH%
    if not libgs:
        raise RuntimeError("Please make sure that Ghostscript is installed")
    libgs = windll.LoadLibrary(libgs)
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -133,8 +133,7 @@ class PDFHandler():
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
-            fpath = build_file_path_in_temp_dir(
-                "page-{page}.pdf".format(page=page))
+            fpath = build_file_path_in_temp_dir(f"page-{page}.pdf")
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
@ -211,8 +210,7 @@ class PDFHandler():
                                      page_idx, layout_kwargs)
            if not suppress_stdout:
                rootname = os.path.basename(parser.rootname)
-                logger.info(
-                    "Processing {rootname}".format(rootname=rootname))
+                logger.info(f"Processing {rootname}")
            t = parser.extract_tables()
            tables.extend(t)
        return TableList(sorted(tables))
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-from __future__ import division
-
 import cv2
 import numpy as np

--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-

-from __future__ import division
 import os

 from .base import BaseParser
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-from __future__ import division
-
 import warnings

 from .base import TextBaseParser
@ -167,8 +165,7 @@ class Stream(TextBaseParser):
                    ncols = max(set(elements), key=elements.count)
                else:
                    warnings.warn(
-                        "No tables found in table area {bbox}".format(
-                            bbox=bbox)
+                        f"No tables found in table area {bbox}"
                    )
            cols = [
                (t.x0, t.x1)
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -191,14 +191,11 @@ class PlotMethods():

        if table.flavor == "lattice" and kind in ["textedge"]:
            raise NotImplementedError(
-                "Lattice flavor does not support kind='{}'".format(kind)
+                f"Lattice flavor does not support kind='{kind}'"
            )
        if table.flavor != "lattice" and kind in ["line"]:
            raise NotImplementedError(
-                "{flavor} flavor does not support kind='{kind}'".format(
-                    flavor=table.flavor,
-                    kind=kind
-                )
+                f"{table.flavor} flavor does not support kind='{kind}'"
            )

        plot_method = getattr(self, kind)
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,10 +1,9 @@
 # -*- coding: utf-8 -*-
-from __future__ import division

-import re
 import os
 import atexit
 import sys
+import re
 import random
 import shutil
 import string
@ -34,18 +33,9 @@ from pdfminer.layout import (

 from .ext.ghostscript import Ghostscript

-# pylint: disable=import-error
-# PyLint will evaluate both branches, and will necessarily complain about one
-# of them.
-PY3 = sys.version_info[0] >= 3
-if PY3:
-    from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse as parse_url
 from urllib.parse import uses_relative, uses_netloc, uses_params
-else:
-    from urllib2 import urlopen
-    from urlparse import urlparse as parse_url
-    from urlparse import uses_relative, uses_netloc, uses_params


 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
@ -95,14 +85,12 @@ def download_url(url):
        Temporary filepath.

    """
-    filename = "{}.pdf".format(random_string(6))
+    filename = f"{random_string(6)}.pdf"
    with tempfile.NamedTemporaryFile("wb", delete=False) as f:
-        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
-        obj = urlopen(req)
-        if PY3:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        request = Request(url, None, headers)
+        obj = urlopen(request)
        content_type = obj.info().get_content_type()
-        else:
-            content_type = obj.info().getheader("Content-Type")
        if content_type != "application/pdf":
            raise NotImplementedError("File format not supported")
        f.write(obj.read())
@ -110,6 +98,7 @@ def download_url(url):
    shutil.move(f.name, filepath)
    return filepath

+
 common_kwargs = [
    "flag_size",
    "margins",
@ -150,8 +139,6 @@ def validate_input(kwargs, flavor="lattice"):
    isec = set(kwargs.keys()).difference(set(parser_kwargs))
    if isec:
        raise ValueError(
-            "{} cannot be used with flavor='{}'".format(
-                ",".join(sorted(isec)), flavor
-            )
+            f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
        )

@ -763,7 +751,7 @@ def text_strip(text, strip=""):
        return text

    stripped = re.sub(
-        r"[{}]".format("".join(map(re.escape, strip))), "", text, re.UNICODE
+        fr"[{''.join(map(re.escape, strip))}]", "", text, re.UNICODE
    )
    return stripped

@ -998,9 +986,7 @@ def get_table_index(
                text_range = (t.x0, t.x1)
                col_range = (table.cols[0][0], table.cols[-1][1])
                warnings.warn(
-                    "{} {} does not lie in column range {}".format(
-                        text, text_range, col_range
-                    )
+                    f"{text} {text_range} does not lie in column range {col_range}"
                )
            r_idx = r
            c_idx = lt_col_overlap.index(max(lt_col_overlap))
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,4 @@
+chardet>=3.0.4
 click>=6.7
 matplotlib>=2.2.3
 numpy>=1.13.3
@ -6,3 +7,4 @@ openpyxl>=2.5.8
 pandas>=0.23.4
 pdfminer.six>=20200402
 PyPDF2>=1.26.0
+Sphinx>=1.7.9
--- a/setup.py
+++ b/setup.py
@ -71,9 +71,9 @@ def setup_package():
                        # Trove classifiers
                        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers # noqa
                        'License :: OSI Approved :: MIT License',
-                        'Programming Language :: Python :: 3.5',
                        'Programming Language :: Python :: 3.6',
-                        'Programming Language :: Python :: 3.7'
+                        'Programming Language :: Python :: 3.7',
+                        'Programming Language :: Python :: 3.8'
                    ])

    try:
--- a/tests/__init__.py
+++ b/tests/__init__.py
@ -1,2 +1,3 @@
 import matplotlib
-matplotlib.use('agg')
+
+matplotlib.use("agg")
--- a/tests/data.py
+++ b/tests/data.py
@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-from __future__ import unicode_literals
-

 data_stream = [
    ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -140,10 +140,11 @@ def test_cli_password():
 def test_cli_output_format():
    with TemporaryDirectory() as tempdir:
        infile = os.path.join(testdir, "health.pdf")
-        outfile = os.path.join(tempdir, "health.{}")
+
        runner = CliRunner()

        # json
+        outfile = os.path.join(tempdir, "health.json")
        result = runner.invoke(
            cli,
            ["--format", "json", "--output", outfile.format("json"), "stream",
@ -152,6 +153,7 @@ def test_cli_output_format():
        assert result.exit_code == 0

        # excel
+        outfile = os.path.join(tempdir, "health.xlsx")
        result = runner.invoke(
            cli,
            ["--format", "excel", "--output", outfile.format("xlsx"), "stream",
@ -160,6 +162,7 @@ def test_cli_output_format():
        assert result.exit_code == 0

        # html
+        outfile = os.path.join(tempdir, "health.html")
        result = runner.invoke(
            cli,
            ["--format", "html", "--output", outfile.format("html"), "stream",
@ -168,6 +171,7 @@ def test_cli_output_format():
        assert result.exit_code == 0

        # zip
+        outfile = os.path.join(tempdir, "health.csv")
        result = runner.invoke(
            cli,
            [
@ -175,7 +179,7 @@ def test_cli_output_format():
                "--format",
                "csv",
                "--output",
-                outfile.format("csv"),
+                outfile,
                "stream",
                infile,
            ],
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@ -10,7 +10,7 @@ import camelot

 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
-filename = os.path.join(testdir, 'foo.pdf')
+filename = os.path.join(testdir, "foo.pdf")


 def test_unknown_flavor():
@ -27,15 +27,14 @@ def test_input_kwargs():


 def test_unsupported_format():
-    message = 'File format not supported'
-    filename = os.path.join(testdir, 'foo.csv')
+    message = "File format not supported"
+    filename = os.path.join(testdir, "foo.csv")
    with pytest.raises(NotImplementedError, match=message):
        camelot.read_pdf(filename)


 def test_stream_equal_length():
-    message = ("Length of table_areas and columns"
-               " should be equal")
+    message = "Length of table_areas and columns" " should be equal"
    with pytest.raises(ValueError, match=message):
        camelot.read_pdf(
            filename,
@ -46,9 +45,9 @@ def test_stream_equal_length():


 def test_image_warning():
-    filename = os.path.join(testdir, 'image.pdf')
+    filename = os.path.join(testdir, "image.pdf")
    with warnings.catch_warnings():
-        warnings.simplefilter('error')
+        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as e:
            camelot.read_pdf(filename)
        assert str(e.value) == 'page-1 is image-based, camelot only works ' \
@ -56,47 +55,47 @@ def test_image_warning():


 def test_no_tables_found():
-    filename = os.path.join(testdir, 'blank.pdf')
+    filename = os.path.join(testdir, "blank.pdf")
    with warnings.catch_warnings():
-        warnings.simplefilter('error')
+        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as e:
            camelot.read_pdf(filename)
        assert str(e.value) == 'No tables found on page-1'


 def test_no_tables_found_logs_suppressed():
-    filename = os.path.join(testdir, 'foo.pdf')
+    filename = os.path.join(testdir, "foo.pdf")
    with warnings.catch_warnings():
        # the test should fail if any warning is thrown
-        warnings.simplefilter('error')
+        warnings.simplefilter("error")
        try:
            camelot.read_pdf(filename, suppress_stdout=True)
        except Warning as e:
            warning_text = str(e)
-            pytest.fail('Unexpected warning: {}'.format(warning_text))
+            pytest.fail(f"Unexpected warning: {warning_text}")


 def test_no_tables_found_warnings_suppressed():
-    filename = os.path.join(testdir, 'blank.pdf')
+    filename = os.path.join(testdir, "blank.pdf")
    with warnings.catch_warnings():
        # the test should fail if any warning is thrown
-        warnings.simplefilter('error')
+        warnings.simplefilter("error")
        try:
            camelot.read_pdf(filename, suppress_stdout=True)
        except Warning as e:
            warning_text = str(e)
-            pytest.fail('Unexpected warning: {}'.format(warning_text))
+            pytest.fail(f"Unexpected warning: {warning_text}")


 def test_no_password():
-    filename = os.path.join(testdir, 'health_protected.pdf')
-    message = 'file has not been decrypted'
+    filename = os.path.join(testdir, "health_protected.pdf")
+    message = "file has not been decrypted"
    with pytest.raises(Exception, match=message):
        camelot.read_pdf(filename)


 def test_bad_password():
-    filename = os.path.join(testdir, 'health_protected.pdf')
-    message = 'file has not been decrypted'
+    filename = os.path.join(testdir, "health_protected.pdf")
+    message = "file has not been decrypted"
    with pytest.raises(Exception, match=message):
        camelot.read_pdf(filename, password='wrongpass')