diff --git a/.bandit b/.bandit new file mode 100644 index 0000000..c936150 --- /dev/null +++ b/.bandit @@ -0,0 +1,3 @@ +[bandit] +# Ignore concerns about asserts, necessary for unit test code +skips: B101,B102 diff --git a/.gitignore b/.gitignore index d0aea62..3af88c1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ build/ dist/ +prof/ *.egg-info/ .eggs/ .coverage @@ -17,3 +18,5 @@ htmlcov/ # vscode .vscode + +.DS_Store \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 7426bb0..c603fd5 100755 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,3 @@ -sudo: true language: python cache: pip addons: @@ -8,10 +7,6 @@ install: - make install jobs: include: - - stage: test - script: - - make test - python: '2.7' - stage: test script: - make test diff --git a/camelot/core.py b/camelot/core.py index b7a02b1..5fff3c6 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -38,7 +38,7 @@ class TextEdge(object): intersections: int Number of intersections with horizontal text rows. is_valid: bool - A text edge is valid if it intersections with at least + A text edge is valid if it intersects with at least TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows. """ @@ -65,7 +65,8 @@ class TextEdge(object): the is_valid attribute. 
""" if np.isclose(self.y0, y0, atol=edge_tol): - self.x = (self.intersections * self.x + x) / float(self.intersections + 1) + self.x = (self.intersections * self.x + x) / \ + float(self.intersections + 1) self.y0 = y0 self.intersections += 1 # a textedge is valid only if it extends uninterrupted @@ -141,13 +142,16 @@ class TextEdges(object): """ intersections_sum = { "left": sum( - te.intersections for te in self._textedges["left"] if te.is_valid + te.intersections for te in self._textedges["left"] + if te.is_valid ), "right": sum( - te.intersections for te in self._textedges["right"] if te.is_valid + te.intersections for te in self._textedges["right"] + if te.is_valid ), "middle": sum( - te.intersections for te in self._textedges["middle"] if te.is_valid + te.intersections for te in self._textedges["middle"] + if te.is_valid ), } @@ -292,7 +296,10 @@ class Cell(object): def __repr__(self): return "".format( - round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2) + round(self.x1, 2), + round(self.y1, 2), + round(self.x2, 2), + round(self.y2, 2) ) @property @@ -342,7 +349,9 @@ class Table(object): def __init__(self, cols, rows): self.cols = cols self.rows = rows - self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] + self.cells = [ + [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows + ] self.df = None self.shape = (0, 0) self.accuracy = 0 @@ -579,7 +588,8 @@ class Table(object): Output filepath. 
""" - kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1} + kw = {"encoding": "utf-8", "index": False, "header": False, + "quoting": 1} kw.update(kwargs) self.df.to_csv(path, **kw) @@ -616,6 +626,7 @@ class Table(object): "encoding": "utf-8", } kw.update(kwargs) + # pylint: disable=abstract-class-instantiated writer = pd.ExcelWriter(path) self.df.to_excel(writer, **kw) writer.save() @@ -692,7 +703,8 @@ class TableList(object): ext = kwargs.get("ext") for table in self._tables: filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + "{}-page-{}-table-{}{}".format(root, table.page, table.order, + ext) ) filepath = os.path.join(dirname, filename) to_format = self._format_func(table, f) @@ -707,7 +719,10 @@ class TableList(object): with zipfile.ZipFile(zipname, "w", allowZip64=True) as z: for table in self._tables: filename = os.path.join( - "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext) + "{}-page-{}-table-{}{}".format(root, + table.page, + table.order, + ext) ) filepath = os.path.join(dirname, filename) z.write(filepath, os.path.basename(filepath)) @@ -739,10 +754,12 @@ class TableList(object): self._compress_dir(**kwargs) elif f == "excel": filepath = os.path.join(dirname, basename) + # pylint: disable=abstract-class-instantiated writer = pd.ExcelWriter(filepath) for table in self._tables: sheet_name = "page-{}-table-{}".format(table.page, table.order) - table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8") + table.df.to_excel(writer, sheet_name=sheet_name, + encoding="utf-8") writer.save() if compress: zipname = os.path.join(os.path.dirname(path), root) + ".zip" diff --git a/camelot/handlers.py b/camelot/handlers.py index 3a6d663..a689ee5 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -113,14 +113,20 @@ class PDFHandler(object): outfile.addPage(p) with open(fpath, "wb") as f: outfile.write(f) - layout, dim = get_page_layout(fpath) + layout, __ = 
get_page_layout(fpath) # fix rotated PDF chars = get_text_objects(layout, ltype="char") horizontal_text = get_text_objects(layout, ltype="horizontal_text") vertical_text = get_text_objects(layout, ltype="vertical_text") rotation = get_rotation(chars, horizontal_text, vertical_text) if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + fpath_new = "".join( + [ + froot.replace("page", "p"), + "_rotated", + fext + ] + ) os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, "rb"), strict=False) if infile.isEncrypted: @@ -136,7 +142,8 @@ class PDFHandler(object): outfile.write(f) def parse( - self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs + self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, + **kwargs ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -149,7 +156,7 @@ class PDFHandler(object): suppress_stdout : str (default: False) Suppress logs and warnings. layout_kwargs : dict, optional (default: {}) - A dict of `pdfminer.layout.LAParams `_ kwargs. + A dict of `pdfminer.layout.LAParams `_ kwargs. # noqa kwargs : dict See camelot.read_pdf kwargs. @@ -159,17 +166,21 @@ class PDFHandler(object): List of tables found in PDF. 
""" + layout_kwargs = layout_kwargs or {} tables = [] with TemporaryDirectory() as tempdir: for p in self.pages: self._save_page(self.filepath, p, tempdir) pages = [ - os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages + os.path.join(tempdir, "page-{0}.pdf".format(p)) + for p in self.pages ] - parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs) + parser = Lattice(**kwargs) \ + if flavor == "lattice" else Stream(**kwargs) for p in pages: t = parser.extract_tables( - p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs + p, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs ) tables.extend(t) return TableList(sorted(tables)) diff --git a/camelot/io.py b/camelot/io.py index a27a7c6..49d05cb 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -12,7 +12,7 @@ def read_pdf( password=None, flavor="lattice", suppress_stdout=False, - layout_kwargs={}, + layout_kwargs=None, **kwargs ): """Read PDF and return extracted tables. @@ -80,16 +80,16 @@ def read_pdf( Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. - For more information, refer `OpenCV's adaptiveThreshold `_. + For more information, refer `OpenCV's adaptiveThreshold `_. # noqa threshold_constant* : int, optional (default: -2) Constant subtracted from the mean or weighted mean. Normally, it is positive but may be zero or negative as well. - For more information, refer `OpenCV's adaptiveThreshold `_. + For more information, refer `OpenCV's adaptiveThreshold `_. # noqa iterations* : int, optional (default: 0) Number of times for erosion/dilation is applied. - For more information, refer `OpenCV's dilate `_. + For more information, refer `OpenCV's dilate `_. # noqa resolution* : int, optional (default: 300) Resolution used for PDF to PNG conversion. 
@@ -98,6 +98,7 @@ def read_pdf( tables : camelot.core.TableList """ + layout_kwargs = layout_kwargs or {} if flavor not in ["lattice", "stream"]: raise NotImplementedError( "Unknown flavor specified." " Use either 'lattice' or 'stream'" diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index cb1bc21..5713625 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -12,9 +12,18 @@ class BaseParser(object): def _generate_layout(self, filename, layout_kwargs): self.filename = filename self.layout_kwargs = layout_kwargs - self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) + self.layout, self.dimensions = get_page_layout( + filename, + **layout_kwargs + ) self.images = get_text_objects(self.layout, ltype="image") - self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") - self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") + self.horizontal_text = get_text_objects( + self.layout, + ltype="horizontal_text" + ) + self.vertical_text = get_text_objects( + self.layout, + ltype="vertical_text" + ) self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 197ff9f..3a40f47 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -2,14 +2,10 @@ from __future__ import division import os -import sys import copy -import locale import logging import warnings -import subprocess -import numpy as np import pandas as pd from .base import BaseParser @@ -80,7 +76,7 @@ class Lattice(BaseParser): Size of a pixel neighborhood that is used to calculate a threshold value for the pixel: 3, 5, 7, and so on. - For more information, refer `OpenCV's adaptiveThreshold `_. + For more information, refer `OpenCV's adaptiveThreshold `_. # noqa threshold_constant : int, optional (default: -2) Constant subtracted from the mean or weighted mean. 
Normally, it is positive but may be zero or negative as well. @@ -102,7 +98,7 @@ class Lattice(BaseParser): process_background=False, line_scale=15, copy_text=None, - shift_text=["l", "t"], + shift_text=None, split_text=False, flag_size=False, strip_text="", @@ -114,6 +110,7 @@ class Lattice(BaseParser): resolution=300, **kwargs ): + shift_text = shift_text or ["l", "t"] self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background @@ -217,8 +214,7 @@ class Lattice(BaseParser): ) gs_call = gs_call.encode().split() null = open(os.devnull, "wb") - with Ghostscript(*gs_call, stdout=null) as gs: - pass + Ghostscript(*gs_call, stdout=null) null.close() def _generate_table_bbox(self): @@ -247,7 +243,8 @@ class Lattice(BaseParser): image_height_scaler = image_height / float(self.pdf_height) pdf_width_scaler = self.pdf_width / float(image_width) pdf_height_scaler = self.pdf_height / float(image_height) - image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) + image_scalers = (image_width_scaler, + image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) if self.table_areas is None: @@ -291,7 +288,11 @@ class Lattice(BaseParser): self.table_bbox_unscaled = copy.deepcopy(table_bbox) - self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + [ + self.table_bbox, + self.vertical_segments, + self.horizontal_segments + ] = scale_image( table_bbox, vertical_segments, horizontal_segments, pdf_scalers ) @@ -315,7 +316,10 @@ class Lattice(BaseParser): rows.extend([tk[1], tk[3]]) # sort horizontal and vertical segments cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) - rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) + rows = merge_close_lines( + sorted(rows, reverse=True), + line_tol=self.line_tol + ) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in 
range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] @@ -359,7 +363,10 @@ class Lattice(BaseParser): accuracy = compute_accuracy([[100, pos_errors]]) if self.copy_text is not None: - table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) + table = Lattice._copy_spanning_text( + table, + copy_text=self.copy_text + ) data = table.data table.df = pd.DataFrame(data) @@ -383,20 +390,28 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + def extract_tables( + self, + filename, + suppress_stdout=False, + layout_kwargs=None + ): + layout_kwargs = layout_kwargs or {} self._generate_layout(filename, layout_kwargs) + rootname = os.path.basename(self.rootname) if not suppress_stdout: - logger.info("Processing {}".format(os.path.basename(self.rootname))) + logger.info("Processing {rootname}".format(rootname=rootname)) if not self.horizontal_text: if self.images: warnings.warn( - "{} is image-based, camelot only works on" - " text-based pages.".format(os.path.basename(self.rootname)) + "{rootname} is image-based, " + "camelot only works on text-based pages." 
+ .format(rootname=rootname) ) else: warnings.warn( - "No tables found on {}".format(os.path.basename(self.rootname)) + "No tables found on {rootname}".format(rootname=rootname) ) return [] @@ -408,8 +423,10 @@ class Lattice(BaseParser): for table_idx, tk in enumerate( sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) ): - cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) - table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) + cols, rows, v_s, h_s = self._generate_columns_and_rows( + table_idx, tk) + table = self._generate_table( + table_idx, cols, rows, v_s=v_s, h_s=h_s) table._bbox = tk _tables.append(table) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 33f2fe5..4af0a0e 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -10,7 +10,8 @@ import pandas as pd from .base import BaseParser from ..core import TextEdges, Table -from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace +from ..utils import (text_in_bbox, get_table_index, compute_accuracy, + compute_whitespace) logger = logging.getLogger("camelot") @@ -70,6 +71,9 @@ class Stream(BaseParser): ): self.table_regions = table_regions self.table_areas = table_areas + self.table_bbox = None + self.t_bbox = None + self.textedges = [] self.columns = columns self._validate_columns() self.split_text = split_text @@ -95,10 +99,10 @@ class Stream(BaseParser): Tuple (x0, y0, x1, y1) in pdf coordinate space. 
""" - xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) - ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) - xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) - ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) + xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction]) + ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction]) + xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction]) + ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction]) text_bbox = (xmin, ymin, xmax, ymax) return text_bbox @@ -119,21 +123,25 @@ class Stream(BaseParser): Two-dimensional list of text objects grouped into rows. """ - row_y = 0 + row_y = None rows = [] temp = [] - for t in text: + non_empty_text = [t for t in text if t.get_text().strip()] + for t in non_empty_text: # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright for obj in t._objs if - # type(obj) is LTChar]): - if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=row_tol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - row_y = t.y0 - temp.append(t) + # if t.get_text().strip() and all([obj.upright \ + # for obj in t._objs + # if type(obj) is LTChar]): + if row_y is None: + row_y = t.y0 + elif not np.isclose(row_y, t.y0, atol=row_tol): + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + # We update the row's bottom as we go, to be forgiving if there + # is a gradual change across multiple columns. 
+ row_y = t.y0 + temp.append(t) rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -170,7 +178,8 @@ class Stream(BaseParser): merged.append(higher) elif column_tol < 0: if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(column_tol)): + if np.isclose(higher[0], lower[1], + atol=abs(column_tol)): merged.append(higher) else: upper_bound = max(lower[1], higher[1]) @@ -198,10 +207,13 @@ class Stream(BaseParser): """ row_mids = [ - sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 + sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0 for r in rows_grouped ] - rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] + rows = [ + (row_mids[i] + row_mids[i - 1]) / 2 + for i in range(1, len(row_mids)) + ] rows.insert(0, text_y_max) rows.append(text_y_min) rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] @@ -230,7 +242,9 @@ class Stream(BaseParser): text = Stream._group_rows(text, row_tol=row_tol) elements = [len(r) for r in text] new_cols = [ - (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r + (t.x0, t.x1) + for r in text if len(r) == max(elements) + for t in r ] cols.extend(Stream._merge_columns(sorted(new_cols))) return cols @@ -262,12 +276,13 @@ class Stream(BaseParser): def _validate_columns(self): if self.table_areas is not None and self.columns is not None: if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" " should be equal") + raise ValueError("Length of table_areas and columns" + " should be equal") def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm described by Anssi Nurminen's master's thesis. 
- Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa Assumes that tables are situated relatively far apart vertically. @@ -284,7 +299,7 @@ class Stream(BaseParser): # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not len(table_bbox): + if not table_bbox: table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} return table_bbox @@ -302,7 +317,8 @@ class Stream(BaseParser): y1 = float(y1) x2 = float(x2) y2 = float(y2) - region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text) + region_text = text_in_bbox( + (x1, y2, x2, y1), self.horizontal_text) hor_text.extend(region_text) # find tables based on nurminen's detection algorithm table_bbox = self._nurminen_table_detection(hor_text) @@ -328,8 +344,10 @@ class Stream(BaseParser): self.t_bbox = t_bbox - text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) - rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) + text_x_min, text_y_min, text_x_max, text_y_max = \ + self._text_bbox(self.t_bbox) + rows_grouped = self._group_rows( + self.t_bbox["horizontal"], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -354,14 +372,23 @@ class Stream(BaseParser): # see if the list contains elements, if yes, then use # the mode after removing 1s elements = list(filter(lambda x: x != 1, elements)) - if len(elements): + if elements: ncols = max(set(elements), key=elements.count) else: warnings.warn( - "No tables found in table area {}".format(table_idx + 1) + "No tables found in table area {}" + .format(table_idx + 1) ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = 
self._merge_columns(sorted(cols), column_tol=self.column_tol) + cols = [ + (t.x0, t.x1) + for r in rows_grouped + if len(r) == ncols + for t in r + ] + cols = self._merge_columns( + sorted(cols), + column_tol=self.column_tol + ) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -431,23 +458,30 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + def extract_tables(self, filename, suppress_stdout=False, + layout_kwargs=None): + layout_kwargs = layout_kwargs or {} self._generate_layout(filename, layout_kwargs) if not suppress_stdout: - logger.info("Processing {}".format(os.path.basename(self.rootname))) + logger.info("Processing {}".format( + os.path.basename(self.rootname))) if not self.horizontal_text: if self.images: warnings.warn( "{} is image-based, camelot only works on" - " text-based pages.".format(os.path.basename(self.rootname)) + " text-based pages.".format( + os.path.basename(self.rootname)) ) else: warnings.warn( - "No tables found on {}".format(os.path.basename(self.rootname)) + "No tables found on {}".format( + os.path.basename(self.rootname)) ) return [] + # Identify plausible areas within the doc where tables lie, + # populate table_bbox keys with these areas. 
self._generate_table_bbox() _tables = [] diff --git a/camelot/plotting.py b/camelot/plotting.py index 5e0dc0c..51928e9 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -37,7 +37,7 @@ class PlotMethods(object): raise NotImplementedError( "Lattice flavor does not support kind='{}'".format(kind) ) - elif table.flavor == "stream" and kind in ["joint", "line"]: + elif table.flavor == "stream" and kind in ["line"]: raise NotImplementedError( "Stream flavor does not support kind='{}'".format(kind) ) @@ -64,7 +64,13 @@ class PlotMethods(object): for t in table._text: xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) - ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1] + ) + ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) return fig @@ -132,7 +138,8 @@ class PlotMethods(object): for t in table_bbox.keys(): ax.add_patch( patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" + (t[0], t[1]), t[2] - t[0], t[3] - t[1], + fill=False, color="red" ) ) if not _FOR_LATTICE: @@ -164,7 +171,10 @@ class PlotMethods(object): xs.extend([t[0], t[2]]) ys.extend([t[1], t[3]]) ax.add_patch( - patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") + patches.Rectangle( + (t[0], t[1]), t[2] - t[0], t[3] - t[1], + color="blue" + ) ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) diff --git a/camelot/utils.py b/camelot/utils.py index e7ad848..c3bf723 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -30,6 +30,9 @@ from pdfminer.layout import ( ) +# pylint: disable=import-error +# PyLint will evaluate both branches, and will necessarily complain about one +# of them. 
PY3 = sys.version_info[0] >= 3 if PY3: from urllib.request import urlopen @@ -310,7 +313,8 @@ def get_rotation(chars, horizontal_text, vertical_text): if hlen < vlen: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars) - rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise" + rotation = "anticlockwise" if clockwise < anticlockwise \ + else "clockwise" return rotation @@ -341,12 +345,16 @@ def segments_in_bbox(bbox, v_segments, h_segments): v_s = [ v for v in v_segments - if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2 + if v[1] > lb[1] - 2 and + v[3] < rt[1] + 2 and + lb[0] - 2 <= v[0] <= rt[0] + 2 ] h_s = [ h for h in h_segments - if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2 + if h[0] > lb[0] - 2 and + h[2] < rt[0] + 2 and + lb[1] - 2 <= h[1] <= rt[1] + 2 ] return v_s, h_s @@ -464,10 +472,10 @@ def flag_font_size(textline, direction, strip_text=""): for t in textline if not isinstance(t, LTAnno) ] - l = [np.round(size, decimals=6) for text, size in d] - if len(set(l)) > 1: + text_sizes = [np.round(size, decimals=6) for text, size in d] + if len(set(text_sizes)) > 1: flist = [] - min_size = min(l) + min_size = min(text_sizes) for key, chars in groupby(d, itemgetter(1)): if key == min_size: fchars = [t[0] for t in chars] @@ -511,7 +519,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): of row/column and text is the an lttextline substring. 
""" - idx = 0 cut_text = [] bbox = textline.bbox try: @@ -528,7 +535,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): ] r = r_idx[0] x_cuts = [ - (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right + (c, table.cells[r][c].x2) + for c in x_overlap + if table.cells[r][c].right ] if not x_cuts: x_cuts = [(x_overlap[0], table.cells[r][-1].x2)] @@ -561,7 +570,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): ] c = c_idx[0] y_cuts = [ - (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom + (r, table.cells[r][c].y1) + for r in y_overlap + if table.cells[r][c].bottom ] if not y_cuts: y_cuts = [(y_overlap[0], table.cells[-1][c].y1)] @@ -644,9 +655,8 @@ def get_table_index( """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): - if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ - r - ][1]: + if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \ + (t.y0 + t.y1) / 2.0 > table.rows[r][1]: lt_col_overlap = [] for c in table.cols: if c[0] <= t.x1 and c[1] >= t.x0: @@ -681,7 +691,9 @@ def get_table_index( X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) charea = X * Y - error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea + error = ( + (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset)) + ) / charea if split_text: return ( @@ -697,13 +709,16 @@ def get_table_index( ( r_idx, c_idx, - flag_font_size(t._objs, direction, strip_text=strip_text), + flag_font_size(t._objs, + direction, + strip_text=strip_text), ) ], error, ) else: - return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \ + error def compute_accuracy(error_weights): @@ -751,7 +766,6 @@ def compute_whitespace(d): """ whitespace = 0 - r_nempty_cells, c_nempty_cells = [], [] for i in d: for j in i: if 
j.strip() == "": @@ -811,6 +825,7 @@ def get_page_layout( width = layout.bbox[2] height = layout.bbox[3] dim = (width, height) + break # we assume a single page pdf return layout, dim diff --git a/docs/user/install.rst b/docs/user/install.rst index b3d4813..4bbf491 100644 --- a/docs/user/install.rst +++ b/docs/user/install.rst @@ -13,7 +13,7 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa $ conda install -c conda-forge camelot-py -.. note:: Camelot is available for Python 2.7, 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_. +.. note:: Camelot is available for Python 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_. .. _conda: https://conda.io/docs/ .. _Anaconda: http://docs.continuum.io/anaconda/ diff --git a/requirements.txt b/requirements.txt index f815e26..764c037 100755 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ numpy>=1.13.3 opencv-python>=3.4.2.17 openpyxl>=2.5.8 pandas>=0.23.4 -pdfminer.six>=20170720 +pdfminer.six>=20200402 PyPDF2>=1.26.0 diff --git a/setup.py b/setup.py index b83f566..b2e90f5 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ requires = [ 'numpy>=1.13.3', 'openpyxl>=2.5.8', 'pandas>=0.23.4', - 'pdfminer.six>=20170720', + 'pdfminer.six>=20200402', 'PyPDF2>=1.26.0' ] @@ -69,9 +69,8 @@ def setup_package(): }, classifiers=[ # Trove classifiers - # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers # noqa 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7' diff --git a/tests/data.py b/tests/data.py index 3338a81..1017160 100755 --- a/tests/data.py +++ 
b/tests/data.py @@ -4,16 +4,6 @@ from __future__ import unicode_literals data_stream = [ - [ - "", - "Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)", - "", - "", - "", - "", - "", - "", - ], ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"], ["", "", "", "", "", "Revenue &", "", ""], ["", "Medical &", "Family", "Medical &", "Family", "", "", ""], @@ -80,7 +70,8 @@ data_stream = [ "5,000", "33,051,480", ], - ["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560", "4,508,180"], + ["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560", + "4,508,180"], [ "Gujarat", "26,328,400", @@ -171,7 +162,8 @@ data_stream = [ "313,762", "67,044,159", ], - ["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700", "0", "3,579,700"], + ["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700", + "0", "3,579,700"], [ "Meghalaya", "2,894,093", @@ -236,7 +228,8 @@ data_stream = [ data_stream_table_rotated = [ [ - "Table 21 Current use of contraception by background characteristics\u2014Continued", + "Table 21 Current use of contraception by background characteristics" + "\u2014Continued", "", "", "", @@ -330,7 +323,8 @@ data_stream_table_rotated = [ "Total", "women", ], - ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", + "", ""], [ "Scheduled caste", "74.8", @@ -407,7 +401,8 @@ data_stream_table_rotated = [ "100.0", "3,319", ], - ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", + "", "", "", ""], [ "Lowest", "64.5", @@ -830,7 +825,8 @@ data_stream_table_rotated = [ data_stream_two_tables_1 = [ [ - "[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)", + "Program. 
Represents arrests reported (not charged) by 12,910 " + "agencies with a total population of 247,526,916 as estimated", "", "", "", @@ -842,7 +838,8 @@ data_stream_two_tables_1 = [ "", ], [ - "Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", + "by the FBI. Some persons may be arrested more than once during a " + "year, therefore, the data in this table, in some cases,", "", "", "", @@ -854,19 +851,8 @@ data_stream_two_tables_1 = [ "", ], [ - "by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ], - [ - "could represent multiple arrests of the same person. See text, this section and source]", + "could represent multiple arrests of the same person. See text, " + "this section and source]", "", "", "", @@ -903,7 +889,8 @@ data_stream_two_tables_1 = [ "and over", ], [ - "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . .\n . .\n . .\n . . .", + "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . " + ".\n . .\n . .\n . . .", "11,062 .6", "1,540 .0", "9,522 .6", @@ -915,7 +902,8 @@ data_stream_two_tables_1 = [ "2,330 .9", ], [ - "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n . .", + "Violent crime . . . . . . . .\n . .\n . .\n . .\n . " + ".\n . .", "467 .9", "69 .1", "398 .8", @@ -976,7 +964,8 @@ data_stream_two_tables_1 = [ "64.5", ], [ - "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . . .", + "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . " + ". .", "1,396 .4", "338 .7", "1,057 .7", @@ -1060,7 +1049,8 @@ data_stream_two_tables_1 = [ "25.5", ], [ - "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n.", + "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. " + ".\n.\n.\n.", "173.7", "5.1", "168.5", @@ -1290,19 +1280,8 @@ data_stream_two_tables_1 = [ ], [ "", - "– Represents zero. X Not applicable. 
1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", - "", - "", - "", - "", - "", - "", - "", - "", - ], - [ - "", - "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", + "– Represents zero. X Not applicable. 1 Buying, receiving, " + "possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", @@ -1315,17 +1294,10 @@ data_stream_two_tables_1 = [ ] data_stream_two_tables_2 = [ - [ - "", - "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", - "", - "", - "", - "", - ], ["Table 325. Arrests by Race: 2009", "", "", "", "", ""], [ - "[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", + "[Based on Uniform Crime Reporting (UCR) Program. Represents " + "arrests reported (not charged) by 12,371 agencies", "", "", "", @@ -1333,7 +1305,8 @@ data_stream_two_tables_2 = [ "", ], [ - "with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", + "with a total population of 239,839,971 as estimated by the FBI. " + "See headnote, Table 324]", "", "", "", @@ -1344,7 +1317,8 @@ data_stream_two_tables_2 = [ ["Offense charged", "", "", "", "Indian/Alaskan", "Asian Pacific"], ["", "Total", "White", "Black", "Native", "Islander"], [ - "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n .\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .", + "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n " + ".\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .", "10,690,561", "7,389,208", "3,027,153", @@ -1352,7 +1326,8 @@ data_stream_two_tables_2 = [ "123,656", ], [ - "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .", + "Violent crime . . . . . . . .\n . .\n . .\n . .\n . " + ".\n .\n .\n . .\n . .\n .\n .\n .\n .\n . 
.", "456,965", "268,346", "177,766", @@ -1368,7 +1343,8 @@ data_stream_two_tables_2 = [ "97", ], [ - "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. " + ".\n.\n.\n.\n.\n. .", "16,362", "10,644", "5,319", @@ -1376,7 +1352,8 @@ data_stream_two_tables_2 = [ "230", ], [ - "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", + "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. " + ".\n.\n.\n. .\n.\n.\n. . . .", "100,496", "43,039", "55,742", @@ -1384,7 +1361,8 @@ data_stream_two_tables_2 = [ "989", ], [ - "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. .\n.\n.\n.", + "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. " + ".\n.\n.\n.", "330,368", "209,922", "111,904", @@ -1392,7 +1370,8 @@ data_stream_two_tables_2 = [ "3,929", ], [ - "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n . .\n .\n .\n .\n . .\n .\n . .\n .\n .", + "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n " + ". .\n .\n .\n .\n . .\n .\n . .\n .\n .", "1,364,409", "922,139", "406,382", @@ -1400,7 +1379,8 @@ data_stream_two_tables_2 = [ "18,289", ], [ - "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n. . . .", + "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. " + ".\n.\n.\n. .\n.\n. . . .", "234,551", "155,994", "74,419", @@ -1408,7 +1388,8 @@ data_stream_two_tables_2 = [ "2,117", ], [ - "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. " + ".\n.\n.\n.\n.\n. .", "1,056,473", "719,983", "306,625", @@ -1416,7 +1397,8 @@ data_stream_two_tables_2 = [ "15,219", ], [ - "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", + "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. " + ".\n.\n. 
.\n.", "63,919", "39,077", "23,184", @@ -1424,7 +1406,8 @@ data_stream_two_tables_2 = [ "841", ], [ - "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . . . .", + "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. " + ".\n.\n.\n. .\n.\n.\n. . . . . .", "9,466", "7,085", "2,154", @@ -1432,7 +1415,8 @@ data_stream_two_tables_2 = [ "112", ], [ - "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", + "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. " + ".\n.\n.\n.\n. .\n.\n. .\n.", "1,032,502", "672,865", "332,435", @@ -1440,7 +1424,8 @@ data_stream_two_tables_2 = [ "12,075", ], [ - "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. .\n. .\n.", + "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. " + ".\n. .\n.", "67,054", "44,730", "21,251", @@ -1448,7 +1433,8 @@ data_stream_two_tables_2 = [ "728", ], [ - "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. . . . . . .", + "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. " + ".\n.\n.\n. . . . . . .", "161,233", "108,032", "50,367", @@ -1456,7 +1442,8 @@ data_stream_two_tables_2 = [ "1,519", ], [ - "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n. .\n.\n.\n.\n.", + "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. " + ".\n.\n. .\n.\n.\n.\n.", "13,960", "9,208", "4,429", @@ -1472,7 +1459,8 @@ data_stream_two_tables_2 = [ "742", ], [ - "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", + "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. " + ".\n. .\n.\n.\n.\n. .", "212,173", "157,723", "48,746", @@ -1496,7 +1484,8 @@ data_stream_two_tables_2 = [ "1,413", ], [ - "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. " + ".\n.\n.\n.\n.\n. 
.", "60,175", "44,240", "14,347", @@ -1504,7 +1493,8 @@ data_stream_two_tables_2 = [ "873", ], [ - "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. " + ".\n.\n.\n.\n.", "1,301,629", "845,974", "437,623", @@ -1512,7 +1502,8 @@ data_stream_two_tables_2 = [ "9,444", ], [ - "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . .\n.\n.\n.\n.\n. .\n. .", + "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . " + ".\n.\n.\n.\n.\n. .\n. .", "8,046", "2,290", "5,518", @@ -1528,7 +1519,8 @@ data_stream_two_tables_2 = [ "624", ], [ - "Driving under the influence . . . . . . .\n. .\n.\n. .\n.\n.\n.\n.\n. .", + "Driving under the influence . . . . . . .\n. .\n.\n. " + ".\n.\n.\n.\n.\n. .", "1,105,401", "954,444", "121,594", @@ -1536,7 +1528,8 @@ data_stream_two_tables_2 = [ "14,460", ], [ - "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . " + ".\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", "444,087", "373,189", "50,431", @@ -1544,7 +1537,8 @@ data_stream_two_tables_2 = [ "5,591", ], [ - "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . .\n.\n.\n.\n.\n.\n.", + "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . " + ".\n.\n.\n.\n.\n.\n.", "469,958", "387,542", "71,020", @@ -1552,7 +1546,8 @@ data_stream_two_tables_2 = [ "2,844", ], [ - "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. " + ".\n.\n.\n.\n.", "515,689", "326,563", "176,169", @@ -1560,7 +1555,8 @@ data_stream_two_tables_2 = [ "4,174", ], [ - "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", + "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. " + ".\n.\n.\n. .\n.\n.\n. . . 
.", "26,347", "14,581", "11,031", @@ -1568,7 +1564,8 @@ data_stream_two_tables_2 = [ "192", ], [ - "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. .\n.", + "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. " + ".\n.", "2,929,217", "1,937,221", "911,670", @@ -1576,7 +1573,8 @@ data_stream_two_tables_2 = [ "36,446", ], [ - "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.\n.\n. .\n. . . .", + "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. " + ".\n.\n.\n.\n.\n. .\n. . . .", "1,513", "677", "828", @@ -1592,7 +1590,8 @@ data_stream_two_tables_2 = [ "1,060", ], [ - "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", + "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. " + ".\n. .\n.\n.\n.\n. .", "73,616", "48,343", "19,670", @@ -1600,14 +1599,6 @@ data_stream_two_tables_2 = [ "3,950", ], ["1 Except forcible rape and prostitution.", "", "", "", "", ""], - [ - "", - "Source: U.S. 
Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", - "", - "", - "", - "", - ], ] data_stream_table_areas = [ @@ -1634,10 +1625,12 @@ data_stream_columns = [ "Nombre Localidad", ], ["Entidad", "", "Municipio", "", "Localidad", ""], - ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0094", + "Granja Adelita"], ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"], ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0102", "Los Arbolitos [Rancho]"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0102", + "Los Arbolitos [Rancho]"], [ "01", "Aguascalientes", @@ -1655,7 +1648,8 @@ data_stream_columns = [ "0112", "Baj\xedo los V\xe1zquez", ], - ["01", "Aguascalientes", "001", "Aguascalientes", "0113", "Baj\xedo de Montoro"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0113", + "Baj\xedo de Montoro"], [ "01", "Aguascalientes", @@ -1697,8 +1691,10 @@ data_stream_columns = [ "Ca\xf1ada Honda [Estaci\xf3n]", ], ["01", "Aguascalientes", "001", "Aguascalientes", "0127", "Los Ca\xf1os"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0128", "El Cari\xf1\xe1n"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0129", "El Carmen [Granja]"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0128", + "El Cari\xf1\xe1n"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0129", + "El Carmen [Granja]"], [ "01", "Aguascalientes", @@ -1733,9 +1729,11 @@ data_stream_columns = [ "El Colorado (El Soyatal)", ], ["01", "Aguascalientes", "001", "Aguascalientes", "0146", "El Conejal"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0157", "Cotorina de Abajo"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0157", + "Cotorina de Abajo"], ["01", "Aguascalientes", "001", "Aguascalientes", "0162", "Coyotes"], - ["01", 
"Aguascalientes", "001", "Aguascalientes", "0166", "La Huerta (La Cruz)"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0166", + "La Huerta (La Cruz)"], [ "01", "Aguascalientes", @@ -1752,17 +1750,20 @@ data_stream_columns = [ "0171", "Los Cuervos (Los Ojos de Agua)", ], - ["01", "Aguascalientes", "001", "Aguascalientes", "0172", "San Jos\xe9 [Granja]"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0172", + "San Jos\xe9 [Granja]"], ["01", "Aguascalientes", "001", "Aguascalientes", "0176", "La Chiripa"], ["01", "Aguascalientes", "001", "Aguascalientes", "0182", "Dolores"], ["01", "Aguascalientes", "001", "Aguascalientes", "0183", "Los Dolores"], ["01", "Aguascalientes", "001", "Aguascalientes", "0190", "El Duraznillo"], ["01", "Aguascalientes", "001", "Aguascalientes", "0191", "Los Dur\xf3n"], ["01", "Aguascalientes", "001", "Aguascalientes", "0197", "La Escondida"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0201", "Brande Vin [Bodegas]"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0201", + "Brande Vin [Bodegas]"], ["01", "Aguascalientes", "001", "Aguascalientes", "0207", "Valle Redondo"], ["01", "Aguascalientes", "001", "Aguascalientes", "0209", "La Fortuna"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0212", "Lomas del Gachup\xedn"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0212", + "Lomas del Gachup\xedn"], [ "01", "Aguascalientes", @@ -1772,22 +1773,12 @@ data_stream_columns = [ "El Carmen (Gallinas G\xfceras) [Rancho]", ], ["01", "Aguascalientes", "001", "Aguascalientes", "0216", "La Gloria"], - ["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"], + ["01", "Aguascalientes", "001", "Aguascalientes", "0226", + "Hacienda Nueva"], ] data_stream_split_text = [ - [ - "FEB", - "RUAR", - "Y 2014 M27 (BUS)", - "", - "ALPHABETIC LISTING BY T", - "YPE", - "", - "", - "", - "ABLPDM27", - ], + ["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""], ["", "", "", "", "OF ACTIVE 
LICENSES", "", "", "", "", "3/19/2014"], ["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""], ["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""], @@ -1977,7 +1968,18 @@ data_stream_split_text = [ "(872) 825-8309", "2014/04/11", ], - ["", "", "A SENSU JAPANESE", "", "7123 SOUTH 92ND EAST", "", "", "", "", ""], + [ + "", + "", + "A SENSU JAPANESE", + "", + "7123 SOUTH 92ND EAST", + "", + "", + "", + "", + "", + ], [ "625422", "BAW", @@ -2029,7 +2031,18 @@ data_stream_split_text = [ "(580) 928-2700", "2014/09/08", ], - ["", "", "ANDOLINI'S PIZZERIA &", "", "12140 EAST 96TH STREET", "", "", "", "", ""], + [ + "", + "", + "ANDOLINI'S PIZZERIA &", + "", + "12140 EAST 96TH STREET", + "", + "", + "", + "", + "", + ], [ "428377", "BAW", @@ -2148,7 +2161,8 @@ data_stream_flag_size = [ "from SBI", "from", ], - ["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"], + ["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", + "NCDC"], ["", "", "", "", "", "& FIs", "", "", "", "Banks", ""], ["1", "2=", "3", "4", "5", "6=", "7", "8", "9", "10", "11"], ["", "(3 to 6)+14", "", "", "", "(7 to13)", "", "", "", "", ""], @@ -2165,7 +2179,8 @@ data_stream_flag_size = [ "-", "0.25", ], - ["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-", "-", "-"], + ["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-", + "-", "-"], [ "Assam", "12.69", @@ -2194,8 +2209,10 @@ data_stream_flag_size = [ ], ["Chhattisgarh", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], ["Goa", "1.4", "1.02", "-", "-", "0.38", "0.31", "-", "0.07", "-", "-"], - ["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11", "-", "0.44"], - ["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64", "-", "0.49"], + ["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11", + "-", "0.44"], + ["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64", + "-", "0.49"], [ "Himachal Pradesh", "8.02", @@ 
-2223,7 +2240,8 @@ data_stream_flag_size = [ "-", ], ["Jharkhand", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], - ["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89", "-", "0.69"], + ["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89", + "-", "0.69"], [ "Kerala", "29.03", @@ -2263,11 +2281,16 @@ data_stream_flag_size = [ "0.02", "2.89", ], - ["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-", "0.09"], - ["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05", "-", "0.03"], - ["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-", "-", "0.03"], - ["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-", "0.04"], - ["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66", "-", "0.2"], + ["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-", + "0.09"], + ["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05", + "-", "0.03"], + ["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-", + "-", "0.03"], + ["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-", + "0.04"], + ["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66", + "-", "0.2"], [ "Punjab", "19.18", @@ -2295,8 +2318,10 @@ data_stream_flag_size = [ "0.81", ], ["Sikkim", "0.16", "-", "-", "-", "0.16", "0.03", "-", "-", "-", "0.01"], - ["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-", "0.68"], - ["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-", "0.02"], + ["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-", + "0.68"], + ["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-", + "0.02"], ["Uttaranchal", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], [ "Uttar Pradesh", @@ -2393,11 +2418,13 @@ data_stream_edge_tol = [ ["Costs", "(0.21)"], ["T\notal investment result per unit", "3.78"], [ - "1 The results cover the period from inception of the Fund at 8 
April 2016 through 31 December 2016.", + "1 The results cover the period from inception of the Fund at " + "8 April 2016 through 31 December 2016.", "", ], [ - "2 The result per unit is calculated using the total number of outstanding unit as per the end of the", + "2 The result per unit is calculated using the total number of " + "outstanding unit as per the end of the", "", ], ["period.", ""], @@ -2454,7 +2481,8 @@ data_lattice_table_rotated = [ "Men", "Women", ], - ["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", "2391"], + ["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", + "2391"], [ "Tamil Nadu", "7387", @@ -2503,11 +2531,16 @@ data_lattice_table_rotated = [ "1417", "1599", ], - ["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122", "2503"], - ["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579", "1709"], - ["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093", "1628"], - ["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413", "2027"], - ["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185", "1366"], + ["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122", + "2503"], + ["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579", + "1709"], + ["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093", + "1628"], + ["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413", + "2027"], + ["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185", + "1366"], [ "Pooled", "38742", @@ -2573,7 +2606,8 @@ data_lattice_two_tables_2 = [ ] data_lattice_table_regions = [ - ["Età dell’Assicurato \nall’epoca del decesso", "Misura % di \nmaggiorazione"], + ["Età dell’Assicurato \nall’epoca del decesso", + "Misura % di \nmaggiorazione"], ["18-75", "1,00%"], ["76-80", "0,50%"], ["81 in poi", "0,10%"], @@ -2596,10 +2630,12 @@ data_lattice_table_areas = [ ["Kerala", "2400", 
"7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""], ["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""], ["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""], - ["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9", ""], + ["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9", + ""], ["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0", ""], ["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8", ""], - ["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6", ""], + ["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6", + ""], ["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5", ""], ["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4", ""], ["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6", ""], @@ -2650,7 +2686,8 @@ data_lattice_process_background = [ "3,658", "3,183", ], - ["Kerala", "23.2.2010 to \n11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"], + ["Kerala", "23.2.2010 to \n11.3.2010", "9", "17", "1.42", "3,559", "2,173", + "855"], ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], ] @@ -2689,7 +2726,8 @@ data_lattice_copy_text = [ ["COHS", "San Mateo", "Health Plan of San Mateo", "113,202"], ["COHS", "Ventura", "Gold Coast Health Plan", "202,217"], ["COHS", "Total COHS Enrollment", "", "2,176,064"], - ["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "", "10,132,022"], + ["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "", + "10,132,022"], ["PCCM", "Los Angeles", "AIDS Healthcare Foundation", "828"], ["PCCM", "San Francisco", "Family Mosaic", "25"], ["PCCM", "Total PHP Enrollment", "", "853"], @@ -2721,7 +2759,8 @@ data_lattice_shift_text_left_top = [ ], ["Blood Pressure #", "2400", "Men (≥ 18yrs)", "10%", "95%", "20%", "1728"], ["", "", "Women (≥ 18 yrs)", "", "", "", "1728"], - ["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", 
"95%", "20%", "1825"], + ["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%", + "1825"], ["", "", "Women (≥ 18 yrs)", "", "", "", "1825"], [ "Knowledge &\nPractices on HTN &\nDM", @@ -2746,7 +2785,8 @@ data_lattice_shift_text_disable = [ "Sample size\nper State", ], ["Anthropometry", "", "", "", "", "", ""], - ["Clinical Examination", "2400", "", "All the available individuals", "", "", ""], + ["Clinical Examination", "2400", "", "All the available individuals", + "", "", ""], ["History of morbidity", "", "", "", "", "", ""], [ "Diet survey", @@ -2758,9 +2798,11 @@ data_lattice_shift_text_disable = [ "", ], ["", "", "Men (≥ 18yrs)", "", "", "", "1728"], - ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], + ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", + "1728"], ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], - ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], + ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", + "1825"], [ "Knowledge &\nPractices on HTN &", "2400", @@ -2785,7 +2827,8 @@ data_lattice_shift_text_right_bottom = [ ], ["Anthropometry", "", "", "", "", "", ""], ["Clinical Examination", "", "", "", "", "", ""], - ["History of morbidity", "2400", "", "", "", "", "All the available individuals"], + ["History of morbidity", "2400", "", "", "", "", + "All the available individuals"], [ "Diet survey", "1200", @@ -2796,9 +2839,11 @@ data_lattice_shift_text_right_bottom = [ "All the individuals partaking meals in the HH", ], ["", "", "Men (≥ 18yrs)", "", "", "", "1728"], - ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], + ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", + "1728"], ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], - ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], + ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", 
"20%", + "1825"], ["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], [ "Knowledge &\nPractices on HTN &\nDM", @@ -2820,7 +2865,7 @@ data_arabic = [ ] data_stream_layout_kwargs = [ - ["V i n s a u Ve r r e", ""], + ["V i n s a u V e r r e", ""], ["Les Blancs", "12.5CL"], ["A.O.P Côtes du Rhône", ""], ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"], diff --git a/tests/files/baseline_plots/test_grid_plot.png b/tests/files/baseline_plots/test_grid_plot.png index 3b835f5..0607d15 100644 Binary files a/tests/files/baseline_plots/test_grid_plot.png and b/tests/files/baseline_plots/test_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_joint_plot.png b/tests/files/baseline_plots/test_joint_plot.png index e9e40ec..9f98d68 100644 Binary files a/tests/files/baseline_plots/test_joint_plot.png and b/tests/files/baseline_plots/test_joint_plot.png differ diff --git a/tests/files/baseline_plots/test_lattice_contour_plot.png b/tests/files/baseline_plots/test_lattice_contour_plot.png index a8d3326..e458b3d 100644 Binary files a/tests/files/baseline_plots/test_lattice_contour_plot.png and b/tests/files/baseline_plots/test_lattice_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot.png b/tests/files/baseline_plots/test_line_plot.png index e8099ce..12c44c0 100644 Binary files a/tests/files/baseline_plots/test_line_plot.png and b/tests/files/baseline_plots/test_line_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index a6e77f7..958ea0a 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_grid_plot.png b/tests/files/baseline_plots/test_stream_grid_plot.png new file mode 100644 index 0000000..818958c Binary files /dev/null and b/tests/files/baseline_plots/test_stream_grid_plot.png differ 
diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png index 8cc3825..63b5520 100644 Binary files a/tests/files/baseline_plots/test_text_plot.png and b/tests/files/baseline_plots/test_text_plot.png differ diff --git a/tests/files/baseline_plots/test_textedge_plot.png b/tests/files/baseline_plots/test_textedge_plot.png index 63fc236..1de4e9c 100644 Binary files a/tests/files/baseline_plots/test_textedge_plot.png and b/tests/files/baseline_plots/test_textedge_plot.png differ diff --git a/tests/test_cli.py b/tests/test_cli.py index d1a660f..d3b7d55 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -19,10 +19,16 @@ def test_help_output(): output = result.output assert prog_name == "camelot" - assert result.output.startswith("Usage: %(prog_name)s [OPTIONS] COMMAND" % locals()) + assert result.output.startswith( + "Usage: %(prog_name)s [OPTIONS] COMMAND" % + locals() + ) assert all( v in result.output - for v in ["Options:", "--version", "--help", "Commands:", "lattice", "stream"] + for v in [ + "Options:", "--version", "--help", "Commands:", "lattice", + "stream" + ] ) @@ -120,21 +126,24 @@ def test_cli_output_format(): # json result = runner.invoke( cli, - ["--format", "json", "--output", outfile.format("json"), "stream", infile], + ["--format", "json", "--output", outfile.format("json"), "stream", + infile], ) assert result.exit_code == 0 # excel result = runner.invoke( cli, - ["--format", "excel", "--output", outfile.format("xlsx"), "stream", infile], + ["--format", "excel", "--output", outfile.format("xlsx"), "stream", + infile], ) assert result.exit_code == 0 # html result = runner.invoke( cli, - ["--format", "html", "--output", outfile.format("html"), "stream", infile], + ["--format", "html", "--output", outfile.format("html"), "stream", + infile], ) assert result.exit_code == 0 @@ -166,6 +175,10 @@ def test_cli_quiet(): assert "No tables found on page-1" in result.output result = runner.invoke( - cli, 
["--quiet", "--format", "csv", "--output", outfile, "stream", infile] + cli, + [ + "--quiet", "--format", "csv", "--output", outfile, "stream", + infile + ] ) assert "No tables found on page-1" not in result.output diff --git a/tests/test_common.py b/tests/test_common.py index 6fadc9d..6395a42 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -11,12 +11,15 @@ from camelot.__version__ import generate_version from .data import * + testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") def test_parsing_report(): - parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1} + parsing_report = { + "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1 + } filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) @@ -28,9 +31,11 @@ def test_password(): filename = os.path.join(testdir, "health_protected.pdf") tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df) tables = camelot.read_pdf(filename, password="userpass", flavor="stream") + assert len(tables) == 1 assert_frame_equal(df, tables[0].df) @@ -229,9 +234,9 @@ def test_repr(): tables = camelot.read_pdf(filename) assert repr(tables) == "" assert repr(tables[0]) == "" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" def test_pages(): @@ -239,22 +244,23 @@ def test_pages(): tables = camelot.read_pdf(url) assert repr(tables) == "" assert repr(tables[0]) == "
" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" tables = camelot.read_pdf(url, pages="1-end") assert repr(tables) == "" assert repr(tables[0]) == "
" - assert ( - repr(tables[0].cells[0][0]) == "" - ) + assert \ + repr(tables[0].cells[0][0]) == \ + "" tables = camelot.read_pdf(url, pages="all") assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == + "" ) @@ -264,7 +270,8 @@ def test_url(): assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == + "" ) @@ -284,7 +291,12 @@ def test_table_order(): return t table_list = TableList( - [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)] + [ + _make_table(2, 1), + _make_table(1, 1), + _make_table(3, 4), + _make_table(1, 2) + ] ) assert [(t.page, t.order) for t in sorted(table_list)] == [ diff --git a/tests/test_plotting.py b/tests/test_plotting.py index f267e29..565c68c 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -4,13 +4,30 @@ import os import pytest +import matplotlib + import camelot +# The version of Matplotlib has an impact on some of the tests. Unfortunately, +# we can't enforce usage of a recent version of Matplotlib without dropping +# support for Python 3.5. +# To check the version of matplotlib installed: +# pip freeze | grep matplotlib +# To force upgrade: +# pip install --upgrade --force-reinstall matplotlib +# To force usage of a Python 3.5 compatible version: +# pip install "matplotlib==2.2.5" +# This condition can be removed in favor of a version requirement bump for +# matplotlib once support for Python 3.5 is dropped. 
+_MPL_VER = matplotlib.__version__.split(".")[:3] +LEGACY_MATPLOTLIB = tuple(int(p) for p in _MPL_VER if p.isdigit()) < (3, 2, 1) testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") +@pytest.mark.skipif(LEGACY_MATPLOTLIB, + reason="depends on a recent version of MatPlotLib") @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) def test_text_plot(): @@ -26,6 +43,15 @@ def test_grid_plot(): tables = camelot.read_pdf(filename) return camelot.plot(tables[0], kind='grid') +@pytest.mark.skipif(LEGACY_MATPLOTLIB, + reason="depends on a recent version of MatPlotLib") +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_stream_grid_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + return camelot.plot(tables[0], kind='grid') + @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) @@ -35,6 +61,8 @@ def test_lattice_contour_plot(): return camelot.plot(tables[0], kind='contour') +@pytest.mark.skipif(LEGACY_MATPLOTLIB, + reason="depends on a recent version of MatPlotLib") @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) def test_stream_contour_plot(): @@ -51,6 +79,8 @@ def test_line_plot(): return camelot.plot(tables[0], kind='line') +@pytest.mark.skipif(LEGACY_MATPLOTLIB, + reason="depends on a recent version of MatPlotLib") @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) def test_joint_plot(): @@ -59,6 +89,8 @@ def test_joint_plot(): return camelot.plot(tables[0], kind='joint') +@pytest.mark.skipif(LEGACY_MATPLOTLIB, + reason="depends on a recent version of MatPlotLib") @pytest.mark.mpl_image_compare( baseline_dir="files/baseline_plots", remove_text=True) def test_textedge_plot():