diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..c45e34f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,48 @@ +--- +name: Bug report +about: Please follow this template to submit bug reports. +title: '' +labels: bug +assignees: '' + +--- + + + +**Describe the bug** +A clear and concise description of what the bug is. + +**Steps to reproduce the bug** +Steps used to install `camelot`: +1. Add step here (you can add more steps too) + +Steps to reproduce the behavior: +1. Add step here (you can add more steps too) + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Code** +Add the Camelot code snippet that you used. +``` +import camelot + +# add your code here +``` + +**PDF** +Add the PDF file that you want to extract tables from. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Environment** + - OS: [e.g. macOS] + - Python version: + - NumPy version: + - OpenCV version: + - Ghostscript version: + - Camelot version: + +**Additional context** +Add any other context about the problem here. 
diff --git a/LICENSE b/LICENSE index 9a71f13..da379bb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,12 +1,7 @@ MIT License -Modifications: - -Copyright (c) 2019 Camelot Developers - -Original project: - -Copyright (c) 2018 Peeply Private Ltd (Singapore) +Copyright (c) 2019-2020 Camelot Developers +Copyright (c) 2018-2019 Peeply Private Ltd (Singapore) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 74cef27..8324b9f 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,13 @@ [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) [![image](https://img.shields.io/badge/continous%20quality-deepsource-lightgrey)](https://deepsource.io/gh/camelot-dev/camelot/?ref=repository-badge) -**Camelot** is a Python library that makes it easy for *anyone* to extract tables from PDF files! +**Camelot** is a Python library that can help you extract tables from PDFs! -**Note:** You can also check out [Excalibur](https://github.com/camelot-dev/excalibur), which is a web interface for Camelot! +**Note:** You can also check out [Excalibur](https://github.com/camelot-dev/excalibur), the web interface to Camelot! --- -**Here's how you can extract tables from PDF files.** Check out the PDF used in this example [here](https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf). +**Here's how you can extract tables from PDFs.** You can check out the PDF used in this example [here](https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf).
 >>> import camelot
@@ -46,24 +46,27 @@
 | 2032_2     | 0.17      | 57.8          | 21.7%                | 0.3%            | 2.7%            | 1.2%           |
 | 4171_1     | 0.07      | 173.9         | 58.1%                | 1.6%            | 2.1%            | 0.5%           |
 
-There's a [command-line interface](https://camelot-py.readthedocs.io/en/master/user/cli.html) too!
+Camelot also comes packaged with a [command-line interface](https://camelot-py.readthedocs.io/en/master/user/cli.html)!
 
 **Note:** Camelot only works with text-based PDFs and not scanned documents. (As Tabula [explains](https://github.com/tabulapdf/tabula#why-tabula), "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".)
 
 ## Why Camelot?
 
-- **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.)
-- *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table.
-- Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
-- **Export** to multiple formats, including JSON, Excel, HTML and Sqlite.
+- **Configurability**: Camelot gives you control over the table extraction process with its [tweakable settings](https://camelot-py.readthedocs.io/en/master/user/advanced.html).
+- **Metrics**: Bad tables can be discarded based on metrics like accuracy and whitespace, without having to manually look at each table.
+- **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). You can also export tables to multiple formats, including CSV, JSON, Excel, HTML and SQLite.
 
-See [comparison with other PDF table extraction libraries and tools](https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
+See [comparison with similar libraries and tools](https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
+
+## Support the development
+
+If Camelot has helped you, please consider supporting its development with a one-time or monthly donation [on OpenCollective](https://opencollective.com/camelot).
 
 ## Installation
 
 ### Using conda
 
-The easiest way to install Camelot is to install it with [conda](https://conda.io/docs/), which is a package manager and  environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution.
+The easiest way to install Camelot is with [conda](https://conda.io/docs/), which is a package manager and environment management system for the [Anaconda](http://docs.continuum.io/anaconda/) distribution.
 
 
 $ conda install -c conda-forge camelot-py
@@ -71,7 +74,7 @@ $ conda install -c conda-forge camelot-py
 
 ### Using pip
 
-After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/bionic/python/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
+After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/bionic/python/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can also use pip to install Camelot:
 
 
 $ pip install "camelot-py[cv]"
@@ -94,40 +97,16 @@ $ pip install ".[cv]"
 
 ## Documentation
 
-Great documentation is available at [http://camelot-py.readthedocs.io/](http://camelot-py.readthedocs.io/).
-
-## Development
-
-The [Contributor's Guide](https://camelot-py.readthedocs.io/en/master/dev/contributing.html) has detailed information about contributing code, documentation, tests and more. We've included some basic information in this README.
-
-### Source code
-
-You can check the latest sources with:
-
-
-$ git clone https://www.github.com/camelot-dev/camelot
-
- -### Setting up a development environment - -You can install the development dependencies easily, using pip: - -
-$ pip install "camelot-py[dev]"
-
- -### Testing - -After installation, you can run tests using: - -
-$ python setup.py test
-
+The documentation is available at [http://camelot-py.readthedocs.io/](http://camelot-py.readthedocs.io/). ## Wrappers - [camelot-php](https://github.com/randomstate/camelot-php) provides a [PHP](https://www.php.net/) wrapper on Camelot. +## Contributing + +The [Contributor's Guide](https://camelot-py.readthedocs.io/en/master/dev/contributing.html) has detailed information about reporting issues and contributing documentation, code, and tests. + ## Versioning Camelot uses [Semantic Versioning](https://semver.org/). For the available versions, see the tags on this repository. For the changelog, you can check out [HISTORY.md](https://github.com/camelot-dev/camelot/blob/master/HISTORY.md). @@ -135,9 +114,3 @@ Camelot uses [Semantic Versioning](https://semver.org/). For the available versi ## License This project is licensed under the MIT License, see the [LICENSE](https://github.com/camelot-dev/camelot/blob/master/LICENSE) file for details. - -## Support the development - -You can support our work on Camelot with a one-time or monthly donation [on OpenCollective](https://opencollective.com/camelot). Organizations who use camelot can also sponsor the project for an acknowledgement on [our documentation site](https://camelot-py.readthedocs.io/en/master/) and this README. - -Special thanks to all the users, organizations and contributors that support Camelot! 
diff --git a/camelot/handlers.py b/camelot/handlers.py index 6aa3a31..9ec10bb 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -70,7 +70,8 @@ class PDFHandler(object): if pages == "1": page_numbers.append({"start": 1, "end": 1}) else: - infile = PdfFileReader(open(filepath, "rb"), strict=False) + instream = open(filepath, "rb") + infile = PdfFileReader(instream, strict=False) if infile.isEncrypted: infile.decrypt(self.password) if pages == "all": @@ -84,6 +85,7 @@ class PDFHandler(object): page_numbers.append({"start": int(a), "end": int(b)}) else: page_numbers.append({"start": int(r), "end": int(r)}) + instream.close() P = [] for p in page_numbers: P.extend(range(p["start"], p["end"] + 1)) @@ -122,7 +124,8 @@ class PDFHandler(object): if rotation != "": fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) os.rename(fpath, fpath_new) - infile = PdfFileReader(open(fpath_new, "rb"), strict=False) + instream = open(fpath_new, "rb") + infile = PdfFileReader(instream, strict=False) if infile.isEncrypted: infile.decrypt(self.password) outfile = PdfFileWriter() @@ -134,6 +137,7 @@ class PDFHandler(object): outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) + instream.close() def parse( self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3749028..39a0464 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -121,6 +121,7 @@ class Stream(BaseParser): row_y = 0 rows = [] temp = [] + for t in text: # is checking for upright necessary? 
# if t.get_text().strip() and all([obj.upright for obj in t._objs if @@ -131,8 +132,10 @@ class Stream(BaseParser): temp = [] row_y = t.y0 temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # TODO: hacky + if len(rows) > 1: + __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -345,43 +348,46 @@ class Stream(BaseParser): else: # calculate mode of the list of number of elements in # each row to guess the number of columns - ncols = max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if len(elements): - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - f"No tables found in table area {table_idx + 1}" + if not len(elements): + cols = [(text_x_min, text_x_max)] + else: + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if len(elements): + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + f"No tables found in table area {table_idx + 1}" + ) + cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] + cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) 
== ncols for t in r] - cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) return cols, rows diff --git a/camelot/utils.py b/camelot/utils.py index 83974f9..2126fbb 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -353,7 +353,7 @@ def text_in_bbox(bbox, text): Returns ------- t_bbox : list - List of PDFMiner text objects that lie inside table. 
+ List of PDFMiner text objects that lie inside table, discarding the overlapping ones """ lb = (bbox[0], bbox[1]) @@ -364,7 +364,97 @@ def text_in_bbox(bbox, text): if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2 ] - return t_bbox + + # Avoid duplicate text by discarding overlapping boxes + rest = {t for t in t_bbox} + for ba in t_bbox: + for bb in rest.copy(): + if ba == bb: + continue + if bbox_intersect(ba, bb): + # if the intersection is larger than 80% of ba's size, we keep the longest + if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8: + if bbox_longer(bb, ba): + rest.discard(ba) + unique_boxes = list(rest) + + return unique_boxes + + +def bbox_intersection_area(ba, bb) -> float: + """Returns area of the intersection of the bounding boxes of two PDFMiner objects. + + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + intersection_area : float + Area of the intersection of the bounding boxes of both objects + + """ + x_left = max(ba.x0, bb.x0) + y_top = min(ba.y1, bb.y1) + x_right = min(ba.x1, bb.x1) + y_bottom = max(ba.y0, bb.y0) + + if x_right < x_left or y_bottom > y_top: + return 0.0 + + intersection_area = (x_right - x_left) * (y_top - y_bottom) + return intersection_area + + +def bbox_area(bb) -> float: + """Returns area of the bounding box of a PDFMiner object. + + Parameters + ---------- + bb : PDFMiner text object + + Returns + ------- + area : float + Area of the bounding box of the object + + """ + return (bb.x1 - bb.x0) * (bb.y1 - bb.y0) + + +def bbox_intersect(ba, bb) -> bool: + """Returns True if the bounding boxes of two PDFMiner objects intersect. 
+ + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + overlaps : bool + True if the bounding boxes intersect + + """ + return ba.x1 >= bb.x0 and bb.x1 >= ba.x0 and ba.y1 >= bb.y0 and bb.y1 >= ba.y0 + + +def bbox_longer(ba, bb) -> bool: + """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second. + + Parameters + ---------- + ba : PDFMiner text object + bb : PDFMiner text object + + Returns + ------- + longer : bool + True if the bounding box of the first object is longer or equal + + """ + return (ba.x1 - ba.x0) >= (bb.x1 - bb.x0) def merge_close_lines(ar, line_tol=2): @@ -411,7 +501,7 @@ def text_strip(text, strip=""): return text stripped = re.sub( - fr"[{''.join(map(re.escape, strip))}]", "", text, re.UNICODE + fr"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE ) return stripped diff --git a/docs/conf.py b/docs/conf.py index ee9f57f..018f393 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,7 +63,7 @@ master_doc = 'index' # General information about the project. project = u'Camelot' -copyright = u'2019, Camelot Developers' +copyright = u'2020, Camelot Developers' author = u'Vinayak Mehta' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/index.rst b/docs/index.rst index 950240e..c3e1de4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,15 +36,15 @@ Release v\ |version|. (:ref:`Installation `) .. image:: https://img.shields.io/badge/continous%20quality-deepsource-lightgrey :target: https://deepsource.io/gh/camelot-dev/camelot/?ref=repository-badge -**Camelot** is a Python library that makes it easy for *anyone* to extract tables from PDF files! +**Camelot** is a Python library that can help you extract tables from PDFs! -.. note:: You can also check out `Excalibur`_, which is a web interface for Camelot! +.. note:: You can also check out `Excalibur`_, the web interface to Camelot! .. 
_Excalibur: https://github.com/camelot-dev/excalibur ---- -**Here's how you can extract tables from PDF files.** Check out the PDF used in this example `here`_. +**Here's how you can extract tables from PDFs.** You can check out the PDF used in this example `here`_. .. _here: _static/pdf/foo.pdf @@ -70,7 +70,7 @@ Release v\ |version|. (:ref:`Installation `) .. csv-table:: :file: _static/csv/foo.csv -There's a :ref:`command-line interface ` too! +Camelot also comes packaged with a :ref:`command-line interface `! .. note:: Camelot only works with text-based PDFs and not scanned documents. (As Tabula `explains`_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) @@ -79,27 +79,27 @@ There's a :ref:`command-line interface ` too! Why Camelot? ------------ -- **You are in control.** Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.) -- *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table. -- Each table is a **pandas DataFrame**, which seamlessly integrates into `ETL and data analysis workflows`_. -- **Export** to multiple formats, including JSON, Excel and HTML. - -See `comparison with other PDF table extraction libraries and tools`_. +- **Configurability**: Camelot gives you control over the table extraction process with its :ref:`tweakable settings `. +- **Metrics**: Bad tables can be discarded based on metrics like accuracy and whitespace, without having to manually look at each table. +- **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into `ETL and data analysis workflows`_. You can also export tables to multiple formats, which include CSV, JSON, Excel, HTML and Sqlite. .. 
_ETL and data analysis workflows: https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873 -.. _comparison with other PDF table extraction libraries and tools: https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools -Support us on OpenCollective ----------------------------- +See `comparison with similar libraries and tools`_. -If Camelot helped you extract tables from PDFs, please consider supporting its development by `becoming a backer or a sponsor on OpenCollective`_! +.. _comparison with similar libraries and tools: https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools -.. _becoming a backer or a sponsor on OpenCollective: https://opencollective.com/camelot +Support the development +----------------------- + +If Camelot has helped you, please consider supporting its development with a one-time or monthly donation `on OpenCollective`_! + +.. _on OpenCollective: https://opencollective.com/camelot The User Guide -------------- -This part of the documentation begins with some background information about why Camelot was created, takes a small dip into the implementation details and then focuses on step-by-step instructions for getting the most out of Camelot. +This part of the documentation begins with some background information about why Camelot was created, takes you through some implementation details, and then focuses on step-by-step instructions for getting the most out of Camelot. .. toctree:: :maxdepth: 2 @@ -115,8 +115,7 @@ This part of the documentation begins with some background information about why The API Documentation/Guide --------------------------- -If you are looking for information on a specific function, class, or method, -this part of the documentation is for you. +If you are looking for information on a specific function, class, or method, this part of the documentation is for you. .. 
toctree:: :maxdepth: 2 @@ -126,8 +125,7 @@ this part of the documentation is for you. The Contributor Guide --------------------- -If you want to contribute to the project, this part of the documentation is for -you. +If you want to contribute to the project, this part of the documentation is for you. .. toctree:: :maxdepth: 2 diff --git a/docs/user/install-deps.rst b/docs/user/install-deps.rst index f0ce2ae..461a1d3 100755 --- a/docs/user/install-deps.rst +++ b/docs/user/install-deps.rst @@ -3,72 +3,59 @@ Installation of dependencies ============================ -The dependencies `Tkinter`_ and `ghostscript`_ can be installed using your system's package manager. You can run one of the following, based on your OS. - -.. _Tkinter: https://wiki.python.org/moin/TkInter -.. _ghostscript: https://www.ghostscript.com +The dependencies `Ghostscript `_ and `Tkinter `_ can be installed using your system's package manager or by running their installer. OS-specific instructions ------------------------ -For Ubuntu -^^^^^^^^^^ +Ubuntu +^^^^^^ :: - $ apt install python-tk ghostscript + $ apt install ghostscript python3-tk -Or for Python 3:: - - $ apt install python3-tk ghostscript - -For macOS -^^^^^^^^^ +MacOS +^^^^^ :: - $ brew install tcl-tk ghostscript + $ brew install ghostscript tcl-tk -For Windows -^^^^^^^^^^^ +Windows +^^^^^^^ -For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_. +For Ghostscript, you can get the installer at their `downloads page `_. And for Tkinter, you can download the `ActiveTcl Community Edition `_ from ActiveState. -.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads -.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html -.. 
_as shown here: https://java.com/en/download/help/path.xml +Checks to see if dependencies are installed correctly +----------------------------------------------------- -Checks to see if dependencies were installed correctly ------------------------------------------------------- +You can run the following checks to see if the dependencies were installed correctly. -You can do the following checks to see if the dependencies were installed correctly. +For Ghostscript +^^^^^^^^^^^^^^^ + +Open the Python REPL and run the following: + +For Ubuntu/macOS:: + + >>> from ctypes.util import find_library + >>> find_library("gs") + "libgs.so.9" + +For Windows:: + + >>> import ctypes.util + >>> ctypes.util.find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))) + + +**Check:** The output of the ``find_library`` function should not be empty. + +If the output is empty, then it's possible that the Ghostscript library is not available in one of the ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH``/``PATH`` variables depending on your operating system. In this case, you may have to modify one of those path variables. For Tkinter ^^^^^^^^^^^ -Launch Python, and then at the prompt, type:: - - >>> import Tkinter - -Or in Python 3:: +Launch Python and then import Tkinter:: >>> import tkinter -If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``. - -For ghostscript -^^^^^^^^^^^^^^^ - -Run the following to check the ghostscript version. - -For Ubuntu/macOS:: - - $ gs -version - -For Windows:: - - C:\> gswin64c.exe -version - -Or for Windows 32-bit:: - - C:\> gswin32c.exe -version - -If you have ghostscript, you should see the ghostscript version and copyright information. +**Check:** Importing ``tkinter`` should not raise an import error. 
diff --git a/docs/user/install.rst b/docs/user/install.rst index 422f118..e7bf89e 100644 --- a/docs/user/install.rst +++ b/docs/user/install.rst @@ -5,42 +5,35 @@ Installation of Camelot This part of the documentation covers the steps to install Camelot. -Using conda ------------ +After :ref:`installing the dependencies `, which include `Ghostscript `_ and `Tkinter `_, you can use one of the following methods to install Camelot: -The easiest way to install Camelot is to install it with `conda`_, which is a package manager and environment management system for the `Anaconda`_ distribution. -:: +.. warning:: The ``lattice`` flavor will fail to run if Ghostscript is not installed. You may run into errors as shown in `issue #193 `_. - $ conda install -c conda-forge camelot-py +pip +--- -.. note:: Camelot is available for Python 2.7, 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_. - -.. _conda: https://conda.io/docs/ -.. _Anaconda: http://docs.continuum.io/anaconda/ -.. _downloads page: https://www.ghostscript.com/download/gsdnld.html -.. _conda-forge: https://conda-forge.org/ - -Using pip ---------- - -After :ref:`installing the dependencies `, which include `Tkinter`_ and `ghostscript`_, you can simply use pip to install Camelot:: +To install Camelot from PyPI using ``pip``, please include the extra ``cv`` requirement as shown:: $ pip install "camelot-py[cv]" -.. _Tkinter: https://wiki.python.org/moin/TkInter -.. _ghostscript: https://www.ghostscript.com +conda +----- + +`conda`_ is a package manager and environment management system for the `Anaconda `_ distribution. 
It can be used to install Camelot from the ``conda-forge`` channel:: + + $ conda install -c conda-forge camelot-py From the source code -------------------- -After :ref:`installing the dependencies `, you can install from the source by: +After :ref:`installing the dependencies `, you can install Camelot from source by: 1. Cloning the GitHub repository. :: $ git clone https://www.github.com/camelot-dev/camelot -2. Then simply using pip again. +2. And then simply using pip again. :: $ cd camelot diff --git a/tests/data.py b/tests/data.py index 7e53792..b2bf706 100755 --- a/tests/data.py +++ b/tests/data.py @@ -2798,3 +2798,51 @@ data_stream_layout_kwargs = [ ["A.O.P Cornas", ""], ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"], ] + +data_stream_duplicated_text = [ + ['', '2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]', '', '', '', '', '', '', '', '', + 'ALL SEASON TEST'], + ['', 'Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]', '', '', '', '', '', '', '', '', + '1.3 - 2.0 MAT. GROUP'], + ['PREV. CROP/HERB:', 'Corn / Surpass, Roundup', '', '', '', '', '', '', '', '', 'S2MNCE01'], + ['SOIL DESCRIPTION:', '', 'Canisteo clay loam, mod. 
well drained, non-irrigated', '', '', '', '', '', '', '', ''], + ['SOIL CONDITIONS:', '', 'High P, high K, 6.7 pH, 3.9% OM, Low SCN', '', '', '', '', '', '', '', '30" ROW SPACING'], + ['TILLAGE/CULTIVATION:', 'conventional w/ fall till', '', '', '', '', '', '', '', '', ''], + ['PEST MANAGEMENT:', 'Roundup twice', '', '', '', '', '', '', '', '', ''], + ['SEEDED - RATE:', 'May 15', '140 000 /A', '', '', '', '', '', '', 'TOP 30 for YIELD of 63 TESTED', ''], + ['HARVESTED - STAND:', 'Oct 3', '122 921 /A', '', '', '', '', '', '', 'AVERAGE of (3) REPLICATIONS', ''], + ['', '', '', '', 'SCN', 'Seed', 'Yield', 'Moisture', 'Lodging', 'Stand', 'Gross'], + ['Company/Brand', 'Product/Brand†', 'Technol.†', 'Mat.', 'Resist.', 'Trmt.†', 'Bu/A', '%', '%', '(x 1000)', + 'Income'], ['Kruger', 'K2 1901', 'RR2Y', '1.9', 'R', 'Ac,PV', '56.4', '7.6', '0', '126.3', '$846'], + ['Stine', '19RA02 §', 'RR2Y', '1.9', 'R', 'CMB', '55.3', '7.6', '0', '120.0', '$830'], + ['Wensman', 'W 3190NR2', 'RR2Y', '1.9', 'R', 'Ac', '54.5', '7.6', '0', '119.5', '$818'], + ['Hefty', 'H17Y12', 'RR2Y', '1.7', 'MR', 'I', '53.7', '7.7', '0', '124.4', '$806'], + ['Dyna-Gro', 'S15RY53', 'RR2Y', '1.5', 'R', 'Ac', '53.6', '7.7', '0', '126.8', '$804'], + ['LG Seeds', 'C2050R2', 'RR2Y', '2.1', 'R', 'Ac', '53.6', '7.7', '0', '123.9', '$804'], + ['Titan Pro', '19M42', 'RR2Y', '1.9', 'R', 'CMB', '53.6', '7.7', '0', '121.0', '$804'], + ['Stine', '19RA02 (2) §', 'RR2Y', '1.9', 'R', 'CMB', '53.4', '7.7', '0', '123.9', '$801'], + ['Asgrow', 'AG1832 §', 'RR2Y', '1.8', 'MR', 'Ac,PV', '52.9', '7.7', '0', '122.0', '$794'], + ['Prairie Brand', 'PB-1566R2', 'RR2Y', '1.5', 'R', 'CMB', '52.8', '7.7', '0', '122.9', '$792'], + ['Channel', '1901R2', 'RR2Y', '1.9', 'R', 'Ac,PV', '52.8', '7.6', '0', '123.4', '$791'], + ['Titan Pro', '20M1', 'RR2Y', '2.0', 'R', 'Am', '52.5', '7.5', '0', '124.4', '$788'], + ['Kruger', 'K2-2002', 'RR2Y', '2.0', 'R', 'Ac,PV', '52.4', '7.9', '0', '125.4', '$786'], + ['Channel', '1700R2', 'RR2Y', '1.7', 'R', 
'Ac,PV', '52.3', '7.9', '0', '123.9', '$784'], + ['Hefty', 'H16Y11', 'RR2Y', '1.6', 'MR', 'I', '51.4', '7.6', '0', '123.9', '$771'], + ['Anderson', '162R2Y', 'RR2Y', '1.6', 'R', 'None', '51.3', '7.5', '0', '119.5', '$770'], + ['Titan Pro', '15M22', 'RR2Y', '1.5', 'R', 'CMB', '51.3', '7.8', '0', '125.4', '$769'], + ['Dairyland', 'DSR-1710R2Y', 'RR2Y', '1.7', 'R', 'CMB', '51.3', '7.7', '0', '122.0', '$769'], + ['Hefty', 'H20R3', 'RR2Y', '2.0', 'MR', 'I', '50.5', '8.2', '0', '121.0', '$757'], + ['Prairie Brand', 'PB 1743R2', 'RR2Y', '1.7', 'R', 'CMB', '50.2', '7.7', '0', '125.8', '$752'], + ['Gold Country', '1741', 'RR2Y', '1.7', 'R', 'Ac', '50.1', '7.8', '0', '123.9', '$751'], + ['Trelay', '20RR43', 'RR2Y', '2.0', 'R', 'Ac,Ex', '49.9', '7.6', '0', '127.8', '$749'], + ['Hefty', 'H14R3', 'RR2Y', '1.4', 'MR', 'I', '49.7', '7.7', '0', '122.9', '$746'], + ['Prairie Brand', 'PB-2099NRR2', 'RR2Y', '2.0', 'R', 'CMB', '49.6', '7.8', '0', '126.3', '$743'], + ['Wensman', 'W 3174NR2', 'RR2Y', '1.7', 'R', 'Ac', '49.3', '7.6', '0', '122.5', '$740'], + ['Kruger', 'K2 1602', 'RR2Y', '1.6', 'R', 'Ac,PV', '48.7', '7.6', '0', '125.4', '$731'], + ['NK Brand', 'S18-C2 §', 'RR2Y', '1.8', 'R', 'CMB', '48.7', '7.7', '0', '126.8', '$731'], + ['Kruger', 'K2 1902', 'RR2Y', '1.9', 'R', 'Ac,PV', '48.7', '7.5', '0', '124.4', '$730'], + ['Prairie Brand', 'PB-1823R2', 'RR2Y', '1.8', 'R', 'None', '48.5', '7.6', '0', '121.0', '$727'], + ['Gold Country', '1541', 'RR2Y', '1.5', 'R', 'Ac', '48.4', '7.6', '0', '110.4', '$726'], + ['', '', '', '', '', 'Test Average =', '47.6', '7.7', '0', '122.9', '$713'], + ['', '', '', '', '', 'LSD (0.10) =', '5.7', '0.3', 'ns', '37.8', '566.4'] +] diff --git a/tests/files/birdisland.pdf b/tests/files/birdisland.pdf new file mode 100644 index 0000000..1501158 Binary files /dev/null and b/tests/files/birdisland.pdf differ diff --git a/tests/files/blank.pdf b/tests/files/blank.pdf deleted file mode 100755 index 99540f1..0000000 Binary files a/tests/files/blank.pdf and 
/dev/null differ diff --git a/tests/files/empty.pdf b/tests/files/empty.pdf new file mode 100644 index 0000000..52aeefb Binary files /dev/null and b/tests/files/empty.pdf differ diff --git a/tests/files/only_page_number.pdf b/tests/files/only_page_number.pdf new file mode 100644 index 0000000..7b4ecfe Binary files /dev/null and b/tests/files/only_page_number.pdf differ diff --git a/tests/test_cli.py b/tests/test_cli.py index cddc9a2..f897315 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -160,8 +160,8 @@ def test_cli_output_format(): def test_cli_quiet(): with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, "blank.pdf") - outfile = os.path.join(tempdir, "blank.csv") + infile = os.path.join(testdir, "empty.pdf") + outfile = os.path.join(tempdir, "empty.csv") runner = CliRunner() result = runner.invoke( diff --git a/tests/test_common.py b/tests/test_common.py index 6fadc9d..cb9a968 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -314,3 +314,11 @@ def test_version_generation_with_prerelease_revision(): generate_version(version, prerelease=prerelease, revision=revision) == "0.7.3-alpha.2" ) + + +def test_stream_duplicated_text(): + df = pd.DataFrame(data_stream_duplicated_text) + + filename = os.path.join(testdir, "birdisland.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert_frame_equal(df, tables[0].df) diff --git a/tests/test_errors.py b/tests/test_errors.py index 2849110..595c54b 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -55,15 +55,33 @@ def test_image_warning(): ) -def test_no_tables_found(): - filename = os.path.join(testdir, "blank.pdf") +def test_lattice_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") with warnings.catch_warnings(): warnings.simplefilter("error") with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename) + tables = camelot.read_pdf(filename, flavor="lattice") assert str(e.value) == "No tables found on page-1" +def 
test_stream_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found on page-1" + + +def test_stream_no_tables_in_area(): + filename = os.path.join(testdir, "only_page_number.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found in table area 1" + + def test_no_tables_found_logs_suppressed(): filename = os.path.join(testdir, "foo.pdf") with warnings.catch_warnings(): @@ -77,7 +95,7 @@ def test_no_tables_found_logs_suppressed(): def test_no_tables_found_warnings_suppressed(): - filename = os.path.join(testdir, "blank.pdf") + filename = os.path.join(testdir, "empty.pdf") with warnings.catch_warnings(): # the test should fail if any warning is thrown warnings.simplefilter("error")