From 5c3a686ebedc933e9055ba9ea947d5931fdfde47 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 8 Dec 2020 18:57:41 +0100 Subject: [PATCH 01/18] Introduce Faq Introduced faq. Started with reducing memory usage. --- docs/user/faq.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 docs/user/faq.rst diff --git a/docs/user/faq.rst b/docs/user/faq.rst new file mode 100644 index 0000000..8f3b59e --- /dev/null +++ b/docs/user/faq.rst @@ -0,0 +1,46 @@ +.. _faq: + +FAQ +=== + +This part of the documentation answers some common questions. If you want to add some questions you can simply open an issue `here `_. + + +How to reduce memory usage for long PDFs? +--------------------------------------------------- + +During table extraction from long PDF documents, RAM usage can grow significantly. + +A simple workaround is to divide the extraction into some chunks (for example, chunks of 50 pages); at the end of every chunk extraction, data are saved to disk. + +For more information, refer to this code snippet from `@anakin87 `_: + +.. code-block:: python3 + + import camelot + + def chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + def extract_tables_with_less_memory_usage(filepath, pages, params={}, + export_path='.', chunks_length=50): + """ + Control page number + and subdivide the extraction work into n-pages parts (chunks_length). 
+ At the end of every part, save the data on disk and free ram + """ + + # get list of document pages from Camelot handler + handler=camelot.handlers.PDFHandler(filepath) + pages_list=handler._get_pages(filepath,pages=pages) + + # chunk pages list + pages_chunks=list(chunks(pages_list,chunks_length)) + + # extraction and export + for chunk in pages_chunks: + pages_string=str(chunk).replace('[','').replace(']','') + tables = camelot.read_pdf(filepath, pages=pages_string,**params) + tables.export(f'{export_path}/tables.json',f='json') From 0dee3855785eeb52f4123d87f94a11a6c62f4918 Mon Sep 17 00:00:00 2001 From: Arnie97 Date: Thu, 17 Dec 2020 22:12:24 +0800 Subject: [PATCH 02/18] Add line_overlap and boxes_flow to LAParams --- camelot/utils.py | 10 ++++++++-- docs/user/advanced.rst | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/camelot/utils.py b/camelot/utils.py index 2126fbb..dae4acb 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -838,23 +838,27 @@ def compute_whitespace(d): def get_page_layout( filename, + line_overlap=0.5, char_margin=1.0, line_margin=0.5, word_margin=0.1, + boxes_flow=0.5, detect_vertical=True, all_texts=True, ): """Returns a PDFMiner LTPage object and page dimension of a single - page pdf. See https://euske.github.io/pdfminer/ to get definitions - of kwargs. + page pdf. To get the definitions of kwargs, see + https://pdfminersix.rtfd.io/en/latest/reference/composable.html. Parameters ---------- filename : string Path to pdf file. 
+ line_overlap : float char_margin : float line_margin : float word_margin : float + boxes_flow : float detect_vertical : bool all_texts : bool @@ -872,9 +876,11 @@ def get_page_layout( if not document.is_extractable: raise PDFTextExtractionNotAllowed(f"Text extraction is not allowed: {filename}") laparams = LAParams( + line_overlap=line_overlap, char_margin=char_margin, line_margin=line_margin, word_margin=word_margin, + boxes_flow=boxes_flow, detect_vertical=detect_vertical, all_texts=all_texts, ) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index b482022..662a7b1 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -618,7 +618,7 @@ Tweak layout generation Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. -To deal with such cases, you can tweak PDFMiner's `LAParams kwargs `_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() `. To know more about the parameters you can tweak, you can check out `PDFMiner docs `_. +To deal with such cases, you can tweak PDFMiner's `LAParams kwargs `_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() `. To know more about the parameters you can tweak, you can check out `PDFMiner docs `_. 
:: From b5cf8a235daefded71df3d7426170930f11b6be1 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 02:29:25 +0530 Subject: [PATCH 03/18] Add github test workflow and remove travis --- .github/workflows/test.yml | 23 +++++++++++++++++++++++ .travis.yml | 29 ----------------------------- 2 files changed, 23 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/test.yml delete mode 100755 .travis.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..5ad8c68 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,23 @@ +name: camelot + +on: [pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install camelot with dependencies + run: | + make install + - name: Test with pytest + run: | + make test diff --git a/.travis.yml b/.travis.yml deleted file mode 100755 index e370649..0000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -sudo: true -language: python -cache: pip -addons: - apt: - update: true -install: - - make install -jobs: - include: - - stage: test - script: - - make test - python: '3.6' - - stage: test - script: - - make test - python: '3.7' - dist: xenial - - stage: test - script: - - make test - python: '3.8' - dist: xenial - - stage: coverage - python: '3.8' - script: - - make test - - codecov --verbose From 0c9504e1bc4a905572642855f40fadbe16e4e007 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 02:36:18 +0530 Subject: [PATCH 04/18] Fix workflow and job names --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ad8c68..350c136 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -1,9 +1,9 @@ -name: camelot +name: build on: [pull_request] jobs: - build: + test: runs-on: ubuntu-latest strategy: matrix: From cf954a7f6d733377c3c53118a82002d02eec15ac Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 02:41:20 +0530 Subject: [PATCH 05/18] Rename file and fix badge --- .github/workflows/{test.yml => tests.yml} | 2 +- README.md | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) rename .github/workflows/{test.yml => tests.yml} (97%) diff --git a/.github/workflows/test.yml b/.github/workflows/tests.yml similarity index 97% rename from .github/workflows/test.yml rename to .github/workflows/tests.yml index 350c136..09254e6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: build +name: tests on: [pull_request] diff --git a/README.md b/README.md index 8324b9f..a3f7f88 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,13 @@ -

+tests

# Camelot: PDF Table Extraction for Humans -[![Build Status](https://travis-ci.org/camelot-dev/camelot.svg?branch=master)](https://travis-ci.org/camelot-dev/camelot) [![Documentation Status](https://readthedocs.org/projects/camelot-py/badge/?version=master)](https://camelot-py.readthedocs.io/en/master/) +![Build Status](https://github.com/camelot-dev/camelot/actions/workflows/tests.yml/badge.svg) [![Documentation Status](https://readthedocs.org/projects/camelot-py/badge/?version=master)](https://camelot-py.readthedocs.io/en/master/) [![codecov.io](https://codecov.io/github/camelot-dev/camelot/badge.svg?branch=master&service=github)](https://codecov.io/github/camelot-dev/camelot?branch=master) [![image](https://img.shields.io/pypi/v/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![image](https://img.shields.io/pypi/l/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![image](https://img.shields.io/pypi/pyversions/camelot-py.svg)](https://pypi.org/project/camelot-py/) [![Gitter chat](https://badges.gitter.im/camelot-dev/Lobby.png)](https://gitter.im/camelot-dev/Lobby) -[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) [![image](https://img.shields.io/badge/continous%20quality-deepsource-lightgrey)](https://deepsource.io/gh/camelot-dev/camelot/?ref=repository-badge) - +[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) **Camelot** is a Python library that can help you extract tables from PDFs! 
From f7c14bf1d4563c3739d30f27b97324dce062acbf Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:28:23 +0530 Subject: [PATCH 06/18] Update HISTORY.md --- HISTORY.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 0a8d6b3..2fd1122 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,26 @@ Release History master ------ +**Bugfixes** + +- [#15](https://github.com/camelot-dev/camelot/issues/15) Fix duplicate strings being assigned to the same cell. [#206](https://github.com/camelot-dev/camelot/pull/206) by [Eduardo Gonzalez Lopez de Murillas](https://github.com/edugonza). +- Save plot when filename is specified. [#121](https://github.com/camelot-dev/camelot/pull/121) by [Jens Diemer](https://github.com/jedie). +- Close file streams explicitly. [#202](https://github.com/camelot-dev/camelot/pull/202) by [Martin Abente Lahaye](https://github.com/tchx84). +- Use correct re.sub signature. [#186](https://github.com/camelot-dev/camelot/pull/186) by [pevisscher](https://github.com/pevisscher). +- [#183](https://github.com/camelot-dev/camelot/issues/183) Fix UnicodeEncodeError when using Stream flavor by adding encoding kwarg to `to_html`. [#188](https://github.com/camelot-dev/camelot/pull/188) by [Stefano Fiorucci](https://github.com/anakin87). +- [#179](https://github.com/camelot-dev/camelot/issues/179) Fix `max() arg is an empty sequence` error on PDFs with blank pages. [#189](https://github.com/camelot-dev/camelot/pull/189) by Vinayak Mehta. + +**Improvements** + +- [Add bug report template.](https://github.com/camelot-dev/camelot/commit/0a3944e54d133b701edfe9c7546ff11289301ba8) +- Move from [Travis to GitHub Actions](https://github.com/camelot-dev/camelot/pull/241). 
+- Update `.readthedocs.yml` and [remove requirements.txt](https://github.com/camelot-dev/camelot/commit/7ab5db39d07baa4063f975e9e00f6073340e04c1#diff-cde814ef2f549dc093f5b8fc533b7e8f47e7b32a8081e0760e57d5c25a1139d9) + +**Documentation** + +- [#193](https://github.com/camelot-dev/camelot/issues/193) Add better checks to confirm proper installation of ghostscript. [#196](https://github.com/camelot-dev/camelot/pull/196) by [jimhall](https://github.com/jimhall). +- Update `advanced.rst` plotting examples. [#119](https://github.com/camelot-dev/camelot/pull/119) by [Jens Diemer](https://github.com/jedie). + 0.8.2 (2020-07-27) ------------------ From 2c59e7b0f7632b2dc4df15b50eb60dcf6b084945 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:29:35 +0530 Subject: [PATCH 07/18] Blacken code --- camelot/core.py | 22 +- camelot/ext/ghostscript/__init__.py | 3 +- camelot/handlers.py | 4 +- camelot/parsers/base.py | 3 +- camelot/parsers/stream.py | 10 +- camelot/plotting.py | 10 +- camelot/utils.py | 4 +- docs/_themes/flask_theme_support.py | 145 ++++---- docs/conf.py | 120 ++++--- setup.py | 101 +++--- tests/data.py | 508 +++++++++++++++++++++++++--- 11 files changed, 676 insertions(+), 254 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index fdc2ae3..65fd1a6 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -55,7 +55,9 @@ class TextEdge(object): x = round(self.x, 2) y0 = round(self.y0, 2) y1 = round(self.y1, 2) - return f"" + return ( + f"" + ) def update_coords(self, x, y0, edge_tol=50): """Updates the text edge's x and bottom y coordinates and sets @@ -102,8 +104,7 @@ class TextEdges(object): return None def add(self, textline, align): - """Adds a new text edge to the current dict. 
- """ + """Adds a new text edge to the current dict.""" x = self.get_x_coord(textline, align) y0 = textline.y0 y1 = textline.y1 @@ -111,8 +112,7 @@ class TextEdges(object): self._textedges[align].append(te) def update(self, textline): - """Updates an existing text edge in the current dict. - """ + """Updates an existing text edge in the current dict.""" for align in ["left", "right", "middle"]: x_coord = self.get_x_coord(textline, align) idx = self.find(x_coord, align) @@ -304,8 +304,7 @@ class Cell(object): @property def bound(self): - """The number of sides on which the cell is bounded. - """ + """The number of sides on which the cell is bounded.""" return self.top + self.bottom + self.left + self.right @@ -361,8 +360,7 @@ class Table(object): @property def data(self): - """Returns two-dimensional list of strings in table. - """ + """Returns two-dimensional list of strings in table.""" d = [] for row in self.cells: d.append([cell.text.strip() for cell in row]) @@ -383,8 +381,7 @@ class Table(object): return report def set_all_edges(self): - """Sets all table edges to True. - """ + """Sets all table edges to True.""" for row in self.cells: for cell in row: cell.left = cell.right = cell.top = cell.bottom = True @@ -526,8 +523,7 @@ class Table(object): return self def set_border(self): - """Sets table border edges to True. 
- """ + """Sets table border edges to True.""" for r in range(len(self.rows)): self.cells[r][0].left = True self.cells[r][len(self.cols) - 1].right = True diff --git a/camelot/ext/ghostscript/__init__.py b/camelot/ext/ghostscript/__init__.py index 1b4ec48..5816475 100644 --- a/camelot/ext/ghostscript/__init__.py +++ b/camelot/ext/ghostscript/__init__.py @@ -81,8 +81,7 @@ class __Ghostscript(object): def Ghostscript(*args, **kwargs): - """Factory function for setting up a Ghostscript instance - """ + """Factory function for setting up a Ghostscript instance""" global __instance__ # Ghostscript only supports a single instance if __instance__ is None: diff --git a/camelot/handlers.py b/camelot/handlers.py index 9ec10bb..fb8d4b5 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -167,9 +167,7 @@ class PDFHandler(object): with TemporaryDirectory() as tempdir: for p in self.pages: self._save_page(self.filepath, p, tempdir) - pages = [ - os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages - ] + pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages] parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) for p in pages: t = parser.extract_tables( diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index cb1bc21..79be789 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -6,8 +6,7 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): - """Defines a base parser. 
- """ + """Defines a base parser.""" def _generate_layout(self, filename, layout_kwargs): self.filename = filename diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 39a0464..c7b21da 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -65,7 +65,7 @@ class Stream(BaseParser): edge_tol=50, row_tol=2, column_tol=0, - **kwargs + **kwargs, ): self.table_regions = table_regions self.table_areas = table_areas @@ -362,10 +362,10 @@ class Stream(BaseParser): if len(elements): ncols = max(set(elements), key=elements.count) else: - warnings.warn( - f"No tables found in table area {table_idx + 1}" - ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] + warnings.warn(f"No tables found in table area {table_idx + 1}") + cols = [ + (t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r + ] cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) inner_text = [] for i in range(1, len(cols)): diff --git a/camelot/plotting.py b/camelot/plotting.py index 473ad13..f5b6afe 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -34,13 +34,9 @@ class PlotMethods(object): raise ImportError("matplotlib is required for plotting.") if table.flavor == "lattice" and kind in ["textedge"]: - raise NotImplementedError( - f"Lattice flavor does not support kind='{kind}'" - ) + raise NotImplementedError(f"Lattice flavor does not support kind='{kind}'") elif table.flavor == "stream" and kind in ["joint", "line"]: - raise NotImplementedError( - f"Stream flavor does not support kind='{kind}'" - ) + raise NotImplementedError(f"Stream flavor does not support kind='{kind}'") plot_method = getattr(self, kind) fig = plot_method(table) @@ -48,7 +44,7 @@ class PlotMethods(object): if filename is not None: fig.savefig(filename) return None - + return fig def text(self, table): diff --git a/camelot/utils.py b/camelot/utils.py index 2126fbb..43e4312 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -870,7 
+870,9 @@ def get_page_layout( parser = PDFParser(f) document = PDFDocument(parser) if not document.is_extractable: - raise PDFTextExtractionNotAllowed(f"Text extraction is not allowed: {filename}") + raise PDFTextExtractionNotAllowed( + f"Text extraction is not allowed: {filename}" + ) laparams = LAParams( char_margin=char_margin, line_margin=line_margin, diff --git a/docs/_themes/flask_theme_support.py b/docs/_themes/flask_theme_support.py index fde6274..90005c6 100644 --- a/docs/_themes/flask_theme_support.py +++ b/docs/_themes/flask_theme_support.py @@ -1,7 +1,19 @@ # flasky pygments style based on tango style from pygments.style import Style -from pygments.token import Keyword, Name, Comment, String, Error, \ - Number, Operator, Generic, Whitespace, Punctuation, Other, Literal +from pygments.token import ( + Keyword, + Name, + Comment, + String, + Error, + Number, + Operator, + Generic, + Whitespace, + Punctuation, + Other, + Literal, +) class FlaskyStyle(Style): @@ -11,76 +23,67 @@ class FlaskyStyle(Style): styles = { # No corresponding class for the following: # Text: "", # class: '' - Whitespace: "underline #f8f8f8", # class: 'w' - Error: "#a40000 border:#ef2929", # class: 'err' - Other: "#000000", # class 'x' - - Comment: "italic #8f5902", # class: 'c' - Comment.Preproc: "noitalic", # class: 'cp' - - Keyword: "bold #004461", # class: 'k' - Keyword.Constant: "bold #004461", # class: 'kc' - Keyword.Declaration: "bold #004461", # class: 'kd' - Keyword.Namespace: "bold #004461", # class: 'kn' - Keyword.Pseudo: "bold #004461", # class: 'kp' - Keyword.Reserved: "bold #004461", # class: 'kr' - Keyword.Type: "bold #004461", # class: 'kt' - - Operator: "#582800", # class: 'o' - Operator.Word: "bold #004461", # class: 'ow' - like keywords - - Punctuation: "bold #000000", # class: 'p' - + Whitespace: "underline #f8f8f8", # class: 'w' + Error: "#a40000 border:#ef2929", # class: 'err' + Other: "#000000", # class 'x' + Comment: "italic #8f5902", # class: 'c' + 
Comment.Preproc: "noitalic", # class: 'cp' + Keyword: "bold #004461", # class: 'k' + Keyword.Constant: "bold #004461", # class: 'kc' + Keyword.Declaration: "bold #004461", # class: 'kd' + Keyword.Namespace: "bold #004461", # class: 'kn' + Keyword.Pseudo: "bold #004461", # class: 'kp' + Keyword.Reserved: "bold #004461", # class: 'kr' + Keyword.Type: "bold #004461", # class: 'kt' + Operator: "#582800", # class: 'o' + Operator.Word: "bold #004461", # class: 'ow' - like keywords + Punctuation: "bold #000000", # class: 'p' # because special names such as Name.Class, Name.Function, etc. # are not recognized as such later in the parsing, we choose them # to look the same as ordinary variables. - Name: "#000000", # class: 'n' - Name.Attribute: "#c4a000", # class: 'na' - to be revised - Name.Builtin: "#004461", # class: 'nb' - Name.Builtin.Pseudo: "#3465a4", # class: 'bp' - Name.Class: "#000000", # class: 'nc' - to be revised - Name.Constant: "#000000", # class: 'no' - to be revised - Name.Decorator: "#888", # class: 'nd' - to be revised - Name.Entity: "#ce5c00", # class: 'ni' - Name.Exception: "bold #cc0000", # class: 'ne' - Name.Function: "#000000", # class: 'nf' - Name.Property: "#000000", # class: 'py' - Name.Label: "#f57900", # class: 'nl' - Name.Namespace: "#000000", # class: 'nn' - to be revised - Name.Other: "#000000", # class: 'nx' - Name.Tag: "bold #004461", # class: 'nt' - like a keyword - Name.Variable: "#000000", # class: 'nv' - to be revised - Name.Variable.Class: "#000000", # class: 'vc' - to be revised - Name.Variable.Global: "#000000", # class: 'vg' - to be revised - Name.Variable.Instance: "#000000", # class: 'vi' - to be revised - - Number: "#990000", # class: 'm' - - Literal: "#000000", # class: 'l' - Literal.Date: "#000000", # class: 'ld' - - String: "#4e9a06", # class: 's' - String.Backtick: "#4e9a06", # class: 'sb' - String.Char: "#4e9a06", # class: 'sc' - String.Doc: "italic #8f5902", # class: 'sd' - like a comment - String.Double: "#4e9a06", # 
class: 's2' - String.Escape: "#4e9a06", # class: 'se' - String.Heredoc: "#4e9a06", # class: 'sh' - String.Interpol: "#4e9a06", # class: 'si' - String.Other: "#4e9a06", # class: 'sx' - String.Regex: "#4e9a06", # class: 'sr' - String.Single: "#4e9a06", # class: 's1' - String.Symbol: "#4e9a06", # class: 'ss' - - Generic: "#000000", # class: 'g' - Generic.Deleted: "#a40000", # class: 'gd' - Generic.Emph: "italic #000000", # class: 'ge' - Generic.Error: "#ef2929", # class: 'gr' - Generic.Heading: "bold #000080", # class: 'gh' - Generic.Inserted: "#00A000", # class: 'gi' - Generic.Output: "#888", # class: 'go' - Generic.Prompt: "#745334", # class: 'gp' - Generic.Strong: "bold #000000", # class: 'gs' - Generic.Subheading: "bold #800080", # class: 'gu' - Generic.Traceback: "bold #a40000", # class: 'gt' + Name: "#000000", # class: 'n' + Name.Attribute: "#c4a000", # class: 'na' - to be revised + Name.Builtin: "#004461", # class: 'nb' + Name.Builtin.Pseudo: "#3465a4", # class: 'bp' + Name.Class: "#000000", # class: 'nc' - to be revised + Name.Constant: "#000000", # class: 'no' - to be revised + Name.Decorator: "#888", # class: 'nd' - to be revised + Name.Entity: "#ce5c00", # class: 'ni' + Name.Exception: "bold #cc0000", # class: 'ne' + Name.Function: "#000000", # class: 'nf' + Name.Property: "#000000", # class: 'py' + Name.Label: "#f57900", # class: 'nl' + Name.Namespace: "#000000", # class: 'nn' - to be revised + Name.Other: "#000000", # class: 'nx' + Name.Tag: "bold #004461", # class: 'nt' - like a keyword + Name.Variable: "#000000", # class: 'nv' - to be revised + Name.Variable.Class: "#000000", # class: 'vc' - to be revised + Name.Variable.Global: "#000000", # class: 'vg' - to be revised + Name.Variable.Instance: "#000000", # class: 'vi' - to be revised + Number: "#990000", # class: 'm' + Literal: "#000000", # class: 'l' + Literal.Date: "#000000", # class: 'ld' + String: "#4e9a06", # class: 's' + String.Backtick: "#4e9a06", # class: 'sb' + String.Char: "#4e9a06", # class: 
'sc' + String.Doc: "italic #8f5902", # class: 'sd' - like a comment + String.Double: "#4e9a06", # class: 's2' + String.Escape: "#4e9a06", # class: 'se' + String.Heredoc: "#4e9a06", # class: 'sh' + String.Interpol: "#4e9a06", # class: 'si' + String.Other: "#4e9a06", # class: 'sx' + String.Regex: "#4e9a06", # class: 'sr' + String.Single: "#4e9a06", # class: 's1' + String.Symbol: "#4e9a06", # class: 'ss' + Generic: "#000000", # class: 'g' + Generic.Deleted: "#a40000", # class: 'gd' + Generic.Emph: "italic #000000", # class: 'ge' + Generic.Error: "#ef2929", # class: 'gr' + Generic.Heading: "bold #000080", # class: 'gh' + Generic.Inserted: "#00A000", # class: 'gi' + Generic.Output: "#888", # class: 'go' + Generic.Prompt: "#745334", # class: 'gp' + Generic.Strong: "bold #000000", # class: 'gs' + Generic.Subheading: "bold #800080", # class: 'gu' + Generic.Traceback: "bold #a40000", # class: 'gt' } diff --git a/docs/conf.py b/docs/conf.py index 018f393..7309ea5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,8 +22,8 @@ import sys # sys.path.insert(0, os.path.abspath('..')) # Insert Camelot's path into the system. -sys.path.insert(0, os.path.abspath('..')) -sys.path.insert(0, os.path.abspath('_themes')) +sys.path.insert(0, os.path.abspath("..")) +sys.path.insert(0, os.path.abspath("_themes")) import camelot @@ -38,33 +38,33 @@ import camelot # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'Camelot' -copyright = u'2020, Camelot Developers' -author = u'Vinayak Mehta' +project = u"Camelot" +copyright = u"2020, Camelot Developers" +author = u"Vinayak Mehta" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -94,7 +94,7 @@ language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -114,7 +114,7 @@ add_module_names = True # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'flask_theme_support.FlaskyStyle' +pygments_style = "flask_theme_support.FlaskyStyle" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -130,18 +130,18 @@ todo_include_todos = True # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
html_theme_options = { - 'show_powered_by': False, - 'github_user': 'camelot-dev', - 'github_repo': 'camelot', - 'github_banner': True, - 'show_related': False, - 'note_bg': '#FFF59C' + "show_powered_by": False, + "github_user": "camelot-dev", + "github_repo": "camelot", + "github_banner": True, + "show_related": False, + "note_bg": "#FFF59C", } # Add any paths that contain custom themes here, relative to this directory. @@ -164,12 +164,12 @@ html_theme_options = { # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = '_static/favicon.ico' +html_favicon = "_static/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -189,10 +189,21 @@ html_use_smartypants = True # Custom sidebar templates, maps document names to template names. html_sidebars = { - 'index': ['sidebarintro.html', 'relations.html', 'sourcelink.html', - 'searchbox.html', 'hacks.html'], - '**': ['sidebarlogo.html', 'localtoc.html', 'relations.html', - 'sourcelink.html', 'searchbox.html', 'hacks.html'] + "index": [ + "sidebarintro.html", + "relations.html", + "sourcelink.html", + "searchbox.html", + "hacks.html", + ], + "**": [ + "sidebarlogo.html", + "localtoc.html", + "relations.html", + "sourcelink.html", + "searchbox.html", + "hacks.html", + ], } # Additional templates that should be rendered to pages, maps page names to @@ -249,34 +260,30 @@ html_show_copyright = True # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
-htmlhelp_basename = 'Camelotdoc' +htmlhelp_basename = "Camelotdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'Camelot.tex', u'Camelot Documentation', - u'Vinayak Mehta', 'manual'), + (master_doc, "Camelot.tex", u"Camelot Documentation", u"Vinayak Mehta", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -316,10 +323,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'Camelot', u'Camelot Documentation', - [author], 1) -] +man_pages = [(master_doc, "Camelot", u"Camelot Documentation", [author], 1)] # If true, show URL addresses after external links. 
# @@ -332,9 +336,15 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'Camelot', u'Camelot Documentation', - author, 'Camelot', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "Camelot", + u"Camelot Documentation", + author, + "Camelot", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. @@ -356,6 +366,6 @@ texinfo_documents = [ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - 'https://docs.python.org/2': None, - 'http://pandas.pydata.org/pandas-docs/stable': None + "https://docs.python.org/2": None, + "http://pandas.pydata.org/pandas-docs/stable": None, } diff --git a/setup.py b/setup.py index b1ac666..c3a68d1 100644 --- a/setup.py +++ b/setup.py @@ -6,38 +6,36 @@ from setuptools import find_packages here = os.path.abspath(os.path.dirname(__file__)) about = {} -with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f: +with open(os.path.join(here, "camelot", "__version__.py"), "r") as f: exec(f.read(), about) -with open('README.md', 'r') as f: +with open("README.md", "r") as f: readme = f.read() requires = [ - 'chardet>=3.0.4', - 'click>=6.7', - 'numpy>=1.13.3', - 'openpyxl>=2.5.8', - 'pandas>=0.23.4', - 'pdfminer.six>=20200726', - 'PyPDF2>=1.26.0' + "chardet>=3.0.4", + "click>=6.7", + "numpy>=1.13.3", + "openpyxl>=2.5.8", + "pandas>=0.23.4", + "pdfminer.six>=20200726", + "PyPDF2>=1.26.0", ] -cv_requires = [ - 'opencv-python>=3.4.2.17' -] +cv_requires = ["opencv-python>=3.4.2.17"] plot_requires = [ - 'matplotlib>=2.2.3', + "matplotlib>=2.2.3", ] dev_requires = [ - 'codecov>=2.0.15', - 'pytest>=5.4.3', - 'pytest-cov>=2.10.0', - 'pytest-mpl>=0.11', - 'pytest-runner>=5.2', - 'Sphinx>=3.1.2' + "codecov>=2.0.15", + "pytest>=5.4.3", + "pytest-cov>=2.10.0", + "pytest-mpl>=0.11", + "pytest-runner>=5.2", + 
"Sphinx>=3.1.2", ] all_requires = cv_requires + plot_requires @@ -45,36 +43,39 @@ dev_requires = dev_requires + all_requires def setup_package(): - metadata = dict(name=about['__title__'], - version=about['__version__'], - description=about['__description__'], - long_description=readme, - long_description_content_type="text/markdown", - url=about['__url__'], - author=about['__author__'], - author_email=about['__author_email__'], - license=about['__license__'], - packages=find_packages(exclude=('tests',)), - install_requires=requires, - extras_require={ - 'all': all_requires, - 'cv': cv_requires, - 'dev': dev_requires, - 'plot': plot_requires - }, - entry_points={ - 'console_scripts': [ - 'camelot = camelot.cli:cli', - ], - }, - classifiers=[ - # Trove classifiers - # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8' - ]) + metadata = dict( + name=about["__title__"], + version=about["__version__"], + description=about["__description__"], + long_description=readme, + long_description_content_type="text/markdown", + url=about["__url__"], + author=about["__author__"], + author_email=about["__author_email__"], + license=about["__license__"], + packages=find_packages(exclude=("tests",)), + install_requires=requires, + extras_require={ + "all": all_requires, + "cv": cv_requires, + "dev": dev_requires, + "plot": plot_requires, + }, + entry_points={ + "console_scripts": [ + "camelot = camelot.cli:cli", + ], + }, + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + ], + ) try: from setuptools import setup @@ -84,5 +85,5 
@@ def setup_package(): setup(**metadata) -if __name__ == '__main__': +if __name__ == "__main__": setup_package() diff --git a/tests/data.py b/tests/data.py index b2bf706..835fbb4 100755 --- a/tests/data.py +++ b/tests/data.py @@ -2800,49 +2800,467 @@ data_stream_layout_kwargs = [ ] data_stream_duplicated_text = [ - ['', '2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]', '', '', '', '', '', '', '', '', - 'ALL SEASON TEST'], - ['', 'Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]', '', '', '', '', '', '', '', '', - '1.3 - 2.0 MAT. GROUP'], - ['PREV. CROP/HERB:', 'Corn / Surpass, Roundup', '', '', '', '', '', '', '', '', 'S2MNCE01'], - ['SOIL DESCRIPTION:', '', 'Canisteo clay loam, mod. well drained, non-irrigated', '', '', '', '', '', '', '', ''], - ['SOIL CONDITIONS:', '', 'High P, high K, 6.7 pH, 3.9% OM, Low SCN', '', '', '', '', '', '', '', '30" ROW SPACING'], - ['TILLAGE/CULTIVATION:', 'conventional w/ fall till', '', '', '', '', '', '', '', '', ''], - ['PEST MANAGEMENT:', 'Roundup twice', '', '', '', '', '', '', '', '', ''], - ['SEEDED - RATE:', 'May 15', '140 000 /A', '', '', '', '', '', '', 'TOP 30 for YIELD of 63 TESTED', ''], - ['HARVESTED - STAND:', 'Oct 3', '122 921 /A', '', '', '', '', '', '', 'AVERAGE of (3) REPLICATIONS', ''], - ['', '', '', '', 'SCN', 'Seed', 'Yield', 'Moisture', 'Lodging', 'Stand', 'Gross'], - ['Company/Brand', 'Product/Brand†', 'Technol.†', 'Mat.', 'Resist.', 'Trmt.†', 'Bu/A', '%', '%', '(x 1000)', - 'Income'], ['Kruger', 'K2 1901', 'RR2Y', '1.9', 'R', 'Ac,PV', '56.4', '7.6', '0', '126.3', '$846'], - ['Stine', '19RA02 §', 'RR2Y', '1.9', 'R', 'CMB', '55.3', '7.6', '0', '120.0', '$830'], - ['Wensman', 'W 3190NR2', 'RR2Y', '1.9', 'R', 'Ac', '54.5', '7.6', '0', '119.5', '$818'], - ['Hefty', 'H17Y12', 'RR2Y', '1.7', 'MR', 'I', '53.7', '7.7', '0', '124.4', '$806'], - ['Dyna-Gro', 'S15RY53', 'RR2Y', '1.5', 'R', 'Ac', '53.6', '7.7', '0', '126.8', '$804'], - ['LG Seeds', 'C2050R2', 'RR2Y', '2.1', 'R', 'Ac', 
'53.6', '7.7', '0', '123.9', '$804'], - ['Titan Pro', '19M42', 'RR2Y', '1.9', 'R', 'CMB', '53.6', '7.7', '0', '121.0', '$804'], - ['Stine', '19RA02 (2) §', 'RR2Y', '1.9', 'R', 'CMB', '53.4', '7.7', '0', '123.9', '$801'], - ['Asgrow', 'AG1832 §', 'RR2Y', '1.8', 'MR', 'Ac,PV', '52.9', '7.7', '0', '122.0', '$794'], - ['Prairie Brand', 'PB-1566R2', 'RR2Y', '1.5', 'R', 'CMB', '52.8', '7.7', '0', '122.9', '$792'], - ['Channel', '1901R2', 'RR2Y', '1.9', 'R', 'Ac,PV', '52.8', '7.6', '0', '123.4', '$791'], - ['Titan Pro', '20M1', 'RR2Y', '2.0', 'R', 'Am', '52.5', '7.5', '0', '124.4', '$788'], - ['Kruger', 'K2-2002', 'RR2Y', '2.0', 'R', 'Ac,PV', '52.4', '7.9', '0', '125.4', '$786'], - ['Channel', '1700R2', 'RR2Y', '1.7', 'R', 'Ac,PV', '52.3', '7.9', '0', '123.9', '$784'], - ['Hefty', 'H16Y11', 'RR2Y', '1.6', 'MR', 'I', '51.4', '7.6', '0', '123.9', '$771'], - ['Anderson', '162R2Y', 'RR2Y', '1.6', 'R', 'None', '51.3', '7.5', '0', '119.5', '$770'], - ['Titan Pro', '15M22', 'RR2Y', '1.5', 'R', 'CMB', '51.3', '7.8', '0', '125.4', '$769'], - ['Dairyland', 'DSR-1710R2Y', 'RR2Y', '1.7', 'R', 'CMB', '51.3', '7.7', '0', '122.0', '$769'], - ['Hefty', 'H20R3', 'RR2Y', '2.0', 'MR', 'I', '50.5', '8.2', '0', '121.0', '$757'], - ['Prairie Brand', 'PB 1743R2', 'RR2Y', '1.7', 'R', 'CMB', '50.2', '7.7', '0', '125.8', '$752'], - ['Gold Country', '1741', 'RR2Y', '1.7', 'R', 'Ac', '50.1', '7.8', '0', '123.9', '$751'], - ['Trelay', '20RR43', 'RR2Y', '2.0', 'R', 'Ac,Ex', '49.9', '7.6', '0', '127.8', '$749'], - ['Hefty', 'H14R3', 'RR2Y', '1.4', 'MR', 'I', '49.7', '7.7', '0', '122.9', '$746'], - ['Prairie Brand', 'PB-2099NRR2', 'RR2Y', '2.0', 'R', 'CMB', '49.6', '7.8', '0', '126.3', '$743'], - ['Wensman', 'W 3174NR2', 'RR2Y', '1.7', 'R', 'Ac', '49.3', '7.6', '0', '122.5', '$740'], - ['Kruger', 'K2 1602', 'RR2Y', '1.6', 'R', 'Ac,PV', '48.7', '7.6', '0', '125.4', '$731'], - ['NK Brand', 'S18-C2 §', 'RR2Y', '1.8', 'R', 'CMB', '48.7', '7.7', '0', '126.8', '$731'], - ['Kruger', 'K2 1902', 'RR2Y', '1.9', 
'R', 'Ac,PV', '48.7', '7.5', '0', '124.4', '$730'], - ['Prairie Brand', 'PB-1823R2', 'RR2Y', '1.8', 'R', 'None', '48.5', '7.6', '0', '121.0', '$727'], - ['Gold Country', '1541', 'RR2Y', '1.5', 'R', 'Ac', '48.4', '7.6', '0', '110.4', '$726'], - ['', '', '', '', '', 'Test Average =', '47.6', '7.7', '0', '122.9', '$713'], - ['', '', '', '', '', 'LSD (0.10) =', '5.7', '0.3', 'ns', '37.8', '566.4'] + [ + "", + "2012 BETTER VARIETIES Harvest Report for Minnesota Central [ MNCE ]", + "", + "", + "", + "", + "", + "", + "", + "", + "ALL SEASON TEST", + ], + [ + "", + "Doug Toreen, Renville County, MN 55310 [ BIRD ISLAND ]", + "", + "", + "", + "", + "", + "", + "", + "", + "1.3 - 2.0 MAT. GROUP", + ], + [ + "PREV. CROP/HERB:", + "Corn / Surpass, Roundup", + "", + "", + "", + "", + "", + "", + "", + "", + "S2MNCE01", + ], + [ + "SOIL DESCRIPTION:", + "", + "Canisteo clay loam, mod. well drained, non-irrigated", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "SOIL CONDITIONS:", + "", + "High P, high K, 6.7 pH, 3.9% OM, Low SCN", + "", + "", + "", + "", + "", + "", + "", + '30" ROW SPACING', + ], + [ + "TILLAGE/CULTIVATION:", + "conventional w/ fall till", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + ["PEST MANAGEMENT:", "Roundup twice", "", "", "", "", "", "", "", "", ""], + [ + "SEEDED - RATE:", + "May 15", + "140 000 /A", + "", + "", + "", + "", + "", + "", + "TOP 30 for YIELD of 63 TESTED", + "", + ], + [ + "HARVESTED - STAND:", + "Oct 3", + "122 921 /A", + "", + "", + "", + "", + "", + "", + "AVERAGE of (3) REPLICATIONS", + "", + ], + ["", "", "", "", "SCN", "Seed", "Yield", "Moisture", "Lodging", "Stand", "Gross"], + [ + "Company/Brand", + "Product/Brand†", + "Technol.†", + "Mat.", + "Resist.", + "Trmt.†", + "Bu/A", + "%", + "%", + "(x 1000)", + "Income", + ], + [ + "Kruger", + "K2 1901", + "RR2Y", + "1.9", + "R", + "Ac,PV", + "56.4", + "7.6", + "0", + "126.3", + "$846", + ], + [ + "Stine", + "19RA02 §", + "RR2Y", + "1.9", + "R", + "CMB", + 
"55.3", + "7.6", + "0", + "120.0", + "$830", + ], + [ + "Wensman", + "W 3190NR2", + "RR2Y", + "1.9", + "R", + "Ac", + "54.5", + "7.6", + "0", + "119.5", + "$818", + ], + ["Hefty", "H17Y12", "RR2Y", "1.7", "MR", "I", "53.7", "7.7", "0", "124.4", "$806"], + [ + "Dyna-Gro", + "S15RY53", + "RR2Y", + "1.5", + "R", + "Ac", + "53.6", + "7.7", + "0", + "126.8", + "$804", + ], + [ + "LG Seeds", + "C2050R2", + "RR2Y", + "2.1", + "R", + "Ac", + "53.6", + "7.7", + "0", + "123.9", + "$804", + ], + [ + "Titan Pro", + "19M42", + "RR2Y", + "1.9", + "R", + "CMB", + "53.6", + "7.7", + "0", + "121.0", + "$804", + ], + [ + "Stine", + "19RA02 (2) §", + "RR2Y", + "1.9", + "R", + "CMB", + "53.4", + "7.7", + "0", + "123.9", + "$801", + ], + [ + "Asgrow", + "AG1832 §", + "RR2Y", + "1.8", + "MR", + "Ac,PV", + "52.9", + "7.7", + "0", + "122.0", + "$794", + ], + [ + "Prairie Brand", + "PB-1566R2", + "RR2Y", + "1.5", + "R", + "CMB", + "52.8", + "7.7", + "0", + "122.9", + "$792", + ], + [ + "Channel", + "1901R2", + "RR2Y", + "1.9", + "R", + "Ac,PV", + "52.8", + "7.6", + "0", + "123.4", + "$791", + ], + [ + "Titan Pro", + "20M1", + "RR2Y", + "2.0", + "R", + "Am", + "52.5", + "7.5", + "0", + "124.4", + "$788", + ], + [ + "Kruger", + "K2-2002", + "RR2Y", + "2.0", + "R", + "Ac,PV", + "52.4", + "7.9", + "0", + "125.4", + "$786", + ], + [ + "Channel", + "1700R2", + "RR2Y", + "1.7", + "R", + "Ac,PV", + "52.3", + "7.9", + "0", + "123.9", + "$784", + ], + ["Hefty", "H16Y11", "RR2Y", "1.6", "MR", "I", "51.4", "7.6", "0", "123.9", "$771"], + [ + "Anderson", + "162R2Y", + "RR2Y", + "1.6", + "R", + "None", + "51.3", + "7.5", + "0", + "119.5", + "$770", + ], + [ + "Titan Pro", + "15M22", + "RR2Y", + "1.5", + "R", + "CMB", + "51.3", + "7.8", + "0", + "125.4", + "$769", + ], + [ + "Dairyland", + "DSR-1710R2Y", + "RR2Y", + "1.7", + "R", + "CMB", + "51.3", + "7.7", + "0", + "122.0", + "$769", + ], + ["Hefty", "H20R3", "RR2Y", "2.0", "MR", "I", "50.5", "8.2", "0", "121.0", "$757"], + [ + "Prairie Brand", + "PB 
1743R2", + "RR2Y", + "1.7", + "R", + "CMB", + "50.2", + "7.7", + "0", + "125.8", + "$752", + ], + [ + "Gold Country", + "1741", + "RR2Y", + "1.7", + "R", + "Ac", + "50.1", + "7.8", + "0", + "123.9", + "$751", + ], + [ + "Trelay", + "20RR43", + "RR2Y", + "2.0", + "R", + "Ac,Ex", + "49.9", + "7.6", + "0", + "127.8", + "$749", + ], + ["Hefty", "H14R3", "RR2Y", "1.4", "MR", "I", "49.7", "7.7", "0", "122.9", "$746"], + [ + "Prairie Brand", + "PB-2099NRR2", + "RR2Y", + "2.0", + "R", + "CMB", + "49.6", + "7.8", + "0", + "126.3", + "$743", + ], + [ + "Wensman", + "W 3174NR2", + "RR2Y", + "1.7", + "R", + "Ac", + "49.3", + "7.6", + "0", + "122.5", + "$740", + ], + [ + "Kruger", + "K2 1602", + "RR2Y", + "1.6", + "R", + "Ac,PV", + "48.7", + "7.6", + "0", + "125.4", + "$731", + ], + [ + "NK Brand", + "S18-C2 §", + "RR2Y", + "1.8", + "R", + "CMB", + "48.7", + "7.7", + "0", + "126.8", + "$731", + ], + [ + "Kruger", + "K2 1902", + "RR2Y", + "1.9", + "R", + "Ac,PV", + "48.7", + "7.5", + "0", + "124.4", + "$730", + ], + [ + "Prairie Brand", + "PB-1823R2", + "RR2Y", + "1.8", + "R", + "None", + "48.5", + "7.6", + "0", + "121.0", + "$727", + ], + [ + "Gold Country", + "1541", + "RR2Y", + "1.5", + "R", + "Ac", + "48.4", + "7.6", + "0", + "110.4", + "$726", + ], + ["", "", "", "", "", "Test Average =", "47.6", "7.7", "0", "122.9", "$713"], + ["", "", "", "", "", "LSD (0.10) =", "5.7", "0.3", "ns", "37.8", "566.4"], ] From 021be79bf738f75b4e25381ba1fccdf0600da049 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:30:34 +0530 Subject: [PATCH 08/18] Fix README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a3f7f88..2e9ed6f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -tests

+

From 3a8f988740a9a89760a3072aec8eba44763afebe Mon Sep 17 00:00:00 2001 From: Tiago Samaha Cordeiro Date: Wed, 24 Mar 2021 11:22:19 -0300 Subject: [PATCH 09/18] use resolution argument to generate image with GS --- camelot/parsers/lattice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 5469fac..3fd5bbe 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -211,8 +211,8 @@ class Lattice(BaseParser): from ..ext.ghostscript import Ghostscript self.imagename = "".join([self.rootname, ".png"]) - gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format( - self.imagename, self.filename + gs_call = "-q -sDEVICE=png16m -o {} -r{} {}".format( + self.imagename, self.resolution, self.filename ) gs_call = gs_call.encode().split() null = open(os.devnull, "wb") From f53be3c73ed9f4203b1926241e800e00eb4424c4 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:53:23 +0530 Subject: [PATCH 10/18] Update HISTORY.md --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 2fd1122..6b4308f 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ master **Bugfixes** +- Fix use of resolution argument to generate image with ghostscript. [#231](https://github.com/camelot-dev/camelot/pull/231) by [Tiago Samaha Cordeiro](https://github.com/tiagosamaha). - [#15](https://github.com/camelot-dev/camelot/issues/15) Fix duplicate strings being assigned to the same cell. [#206](https://github.com/camelot-dev/camelot/pull/206) by [Eduardo Gonzalez Lopez de Murillas](https://github.com/edugonza). - Save plot when filename is specified. [#121](https://github.com/camelot-dev/camelot/pull/121) by [Jens Diemer](https://github.com/jedie). - Close file streams explicitly. [#202](https://github.com/camelot-dev/camelot/pull/202) by [Martin Abente Lahaye](https://github.com/tchx84). 
From 9a3865c716ff67230b31a9f30f985ffd4a47961a Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:55:46 +0530 Subject: [PATCH 11/18] Update HISTORY.md --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 6b4308f..d682e82 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -16,6 +16,7 @@ master **Improvements** +- Add `line_overlap` and `boxes_flow` to `LAParams`. [#219](https://github.com/camelot-dev/camelot/pull/219) by [Arnie97](https://github.com/Arnie97). - [Add bug report template.](https://github.com/camelot-dev/camelot/commit/0a3944e54d133b701edfe9c7546ff11289301ba8) - Move from [Travis to GitHub Actions](https://github.com/camelot-dev/camelot/pull/241). - Update `.readthedocs.yml` and [remove requirements.txt](https://github.com/camelot-dev/camelot/commit/7ab5db39d07baa4063f975e9e00f6073340e04c1#diff-cde814ef2f549dc093f5b8fc533b7e8f47e7b32a8081e0760e57d5c25a1139d9) From c647f573d89d45b87119554c76b4f8234ea63f25 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 15 Jun 2021 03:58:30 +0530 Subject: [PATCH 12/18] Bump version --- HISTORY.md | 3 +++ LICENSE | 2 +- camelot/__version__.py | 2 +- docs/conf.py | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d682e82..0657834 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,9 @@ Release History master ------ +0.9.0 (2021-06-15) +------------------ + **Bugfixes** - Fix use of resolution argument to generate image with ghostscript. [#231](https://github.com/camelot-dev/camelot/pull/231) by [Tiago Samaha Cordeiro](https://github.com/tiagosamaha). 
diff --git a/LICENSE b/LICENSE index da379bb..2435efa 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019-2020 Camelot Developers +Copyright (c) 2019-2021 Camelot Developers Copyright (c) 2018-2019 Peeply Private Ltd (Singapore) Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/camelot/__version__.py b/camelot/__version__.py index 945ce1a..ae0cab1 100644 --- a/camelot/__version__.py +++ b/camelot/__version__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -VERSION = (0, 8, 2) +VERSION = (0, 9, 0) PRERELEASE = None # alpha, beta or rc REVISION = None diff --git a/docs/conf.py b/docs/conf.py index 7309ea5..e338412 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,7 +63,7 @@ master_doc = "index" # General information about the project. project = u"Camelot" -copyright = u"2020, Camelot Developers" +copyright = u"2021, Camelot Developers" author = u"Vinayak Mehta" # The version info for the project you're documenting, acts as replacement for From 14e5569a6706b30808fff4ad41351f4fe18082fc Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sun, 27 Jun 2021 23:16:21 +0530 Subject: [PATCH 13/18] Update bug report template --- .github/ISSUE_TEMPLATE/bug_report.md | 41 +++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index c45e34f..1d39ef1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -10,20 +10,25 @@ assignees: '' **Describe the bug** -A clear and concise description of what the bug is. + + **Steps to reproduce the bug** -Steps used to install `camelot`: -1. Add step here (you can add more steps too) -Steps to reproduce the behavior: -1. Add step here (you can add more steps too) + + + **Expected behavior** -A clear and concise description of what you expected to happen. + + **Code** -Add the Camelot code snippet that you used. 
+ + + ``` import camelot @@ -31,18 +36,22 @@ import camelot ``` **PDF** -Add the PDF file that you want to extract tables from. + + **Screenshots** -If applicable, add screenshots to help explain your problem. + + **Environment** - - OS: [e.g. MacOS] - - Python version: - - Numpy version: - - OpenCV version: - - Ghostscript version: - - Camelot version: + +- OS: [e.g. macOS] +- Python version: +- Numpy version: +- OpenCV version: +- Ghostscript version: +- Camelot version: **Additional context** -Add any other context about the problem here. + + From cbda72ed540a324cd6e620036a928b79c6f0c8a0 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sun, 27 Jun 2021 23:49:25 +0530 Subject: [PATCH 14/18] Fix #229: Update installs-deps.rst --- docs/user/install-deps.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user/install-deps.rst b/docs/user/install-deps.rst index 461a1d3..f4ab87d 100755 --- a/docs/user/install-deps.rst +++ b/docs/user/install-deps.rst @@ -43,8 +43,9 @@ For Ubuntu/MacOS:: For Windows:: + >>> import ctypes >>> from ctypes.util import find_library - >>> find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll")) + >>> find_library("".join(("gsdll", str(ctypes.sizeof(ctypes.c_voidp) * 8), ".dll"))) **Check:** The output of the ``find_library`` function should not be empty. 
From 2aaa913c401e9d2dd7f599aa9e4c2b1c19e943d7 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 28 Jun 2021 00:15:43 +0530 Subject: [PATCH 15/18] Update faq --- docs/index.rst | 1 + docs/user/faq.rst | 61 +++++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index c3e1de4..b15fe33 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -109,6 +109,7 @@ This part of the documentation begins with some background information about why user/install user/how-it-works user/quickstart + user/faq user/advanced user/cli diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 8f3b59e..8081083 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -1,46 +1,51 @@ .. _faq: -FAQ -=== - -This part of the documentation answers some common questions. If you want to add some questions you can simply open an issue `here `_. +Frequently Asked Questions +========================== +This part of the documentation answers some common questions. To add questions, please open an issue `here `_. How to reduce memory usage for long PDFs? ---------------------------------------------------- +----------------------------------------- During table extraction from long PDF documents, RAM usage can grow significantly. - -A simple workaround is to divide the extraction into some chunks (for example, chunks of 50 pages); at the end of every chunk extraction, data are saved to disk. -For more information, refer to this code snippet from `@anakin87 `_: +A simple workaround is to divide the extraction into chunks, and save extracted data to disk at the end of every chunk. -.. 
code-block:: python3 +For more details, check out this code snippet from `@anakin87 <https://github.com/anakin87>`_: + +:: import camelot - + + def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] - - def extract_tables_with_less_memory_usage(filepath, pages, params={}, - export_path='.', chunks_length=50): + yield l[i : i + n] + + + def extract_tables(filepath, pages, chunk_size=50, export_path=".", params={}): """ - Control page number - and subdivide the extraction work into n-pages parts (chunks_length). - At the end of every part, save the data on disk and free ram + Divide the extraction work into chunks of chunk_size pages. At the end of every chunk, + save data on disk and free RAM. + + filepath : str + Filepath or URL of the PDF file. + pages : str, optional (default: '1') + Comma-separated page numbers. + Example: '1,3,4' or '1,4-end' or 'all'. """ - - # get list of document pages from Camelot handler - handler=camelot.handlers.PDFHandler(filepath) - pages_list=handler._get_pages(filepath,pages=pages) - + + # get list of pages from camelot.handlers.PDFHandler + handler = camelot.handlers.PDFHandler(filepath) + page_list = handler._get_pages(filepath, pages=pages) + # chunk pages list - pages_chunks=list(chunks(pages_list,chunks_length)) - + page_chunks = list(chunks(page_list, chunk_size)) + # extraction and export - for chunk in pages_chunks: - pages_string=str(chunk).replace('[','').replace(']','') - tables = camelot.read_pdf(filepath, pages=pages_string,**params) - tables.export(f'{export_path}/tables.json',f='json') + for chunk in page_chunks: + pages_string = str(chunk).replace("[", "").replace("]", "") + tables = camelot.read_pdf(filepath, pages=pages_string, **params) + tables.export(f"{export_path}/tables.csv") From 1f54108f114077de9f33d7539282cf86ba68af9d Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 28 Jun 2021 00:17:04 +0530 Subject: [PATCH 16/18] Update dev deps --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git
a/setup.py b/setup.py index c3a68d1..1e1b4e6 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ dev_requires = [ "pytest-mpl>=0.11", "pytest-runner>=5.2", "Sphinx>=3.1.2", + "sphinx-autobuild>=2021.3.14", ] all_requires = cv_requires + plot_requires From 3d1c16ca3f218b7faf732ac261706b4e3e3b9716 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 28 Jun 2021 00:26:09 +0530 Subject: [PATCH 17/18] Update README and HISTORY --- HISTORY.md | 2 ++ README.md | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0657834..42a3c41 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,8 @@ Release History master ------ +- Add faq section. [#216](https://github.com/camelot-dev/camelot/pull/216) by [Stefano Fiorucci](https://github.com/anakin87). + 0.9.0 (2021-06-15) ------------------ diff --git a/README.md b/README.md index 2e9ed6f..81fd71e 100644 --- a/README.md +++ b/README.md @@ -49,10 +49,12 @@ Camelot also comes packaged with a [command-line interface](https://camelot-py.r **Note:** Camelot only works with text-based PDFs and not scanned documents. (As Tabula [explains](https://github.com/tabulapdf/tabula#why-tabula), "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) +You can check out some frequently asked questions [here](https://camelot-py.readthedocs.io/en/master/user/faq.html). + ## Why Camelot? -- **Configurability**: Camelot gives you control over the table extraction process with its [tweakable settings](https://camelot-py.readthedocs.io/en/master/user/advanced.html). -- **Metrics**: Bad tables can be discarded based on metrics like accuracy and whitespace, without having to manually look at each table. +- **Configurability**: Camelot gives you control over the table extraction process with [tweakable settings](https://camelot-py.readthedocs.io/en/master/user/advanced.html). 
+- **Metrics**: You can discard bad tables based on metrics like accuracy and whitespace, without having to manually look at each table. - **Output**: Each table is extracted into a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). You can also export tables to multiple formats, which include CSV, JSON, Excel, HTML and Sqlite. See [comparison with similar libraries and tools](https://github.com/camelot-dev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). From 216ec3c90b77df95aa4192ddc393527d54b6e33f Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 28 Jun 2021 00:28:35 +0530 Subject: [PATCH 18/18] Add faq --- docs/index.rst | 2 +- docs/user/faq.rst | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index b15fe33..65376b7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -109,8 +109,8 @@ This part of the documentation begins with some background information about why user/install user/how-it-works user/quickstart - user/faq user/advanced + user/faq user/cli The API Documentation/Guide diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 8081083..29bbdad 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -5,6 +5,11 @@ Frequently Asked Questions This part of the documentation answers some common questions. To add questions, please open an issue `here `_. +Does Camelot work with image-based PDFs? +---------------------------------------- + +**No**, Camelot only works with text-based PDFs and not scanned documents. (As Tabula `explains <https://github.com/tabulapdf/tabula#why-tabula>`_, "If you can click and drag to select text in your table in a PDF viewer, then your PDF is text-based".) + How to reduce memory usage for long PDFs? -----------------------------------------