Fix unit tests, lint, drop Python 2 support

Drop EOL Python 2 support. Resolve unit test discrepancies.
Update unit tests to pass in Travis across all supported Python versions.
Lint fixes.
pull/153/head
Frh 2020-04-18 17:25:47 -07:00
parent 7d4c9e53c6
commit bd2aab5b2d
26 changed files with 498 additions and 276 deletions

.bandit 100644

@@ -0,0 +1,3 @@
+[bandit]
+# Ignore concerns about asserts, necessary for unit test code
+skips: B101,B102
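The two skipped checks are Bandit's B101 (assert_used) and B102 (exec_used); pytest-style test code trips B101 on every assertion. A minimal illustration of why the skip is needed (hypothetical test, not from this repo):

    # test_example.py -- pytest relies on bare asserts, which Bandit's
    # B101 (assert_used) check would otherwise flag on every line.
    def test_table_shape():
        rows = [[1, 2], [3, 4]]
        assert len(rows) == 2
        assert len(rows[0]) == 2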

.gitignore vendored

@@ -4,6 +4,7 @@ __pycache__/
 build/
 dist/
+prof/
 *.egg-info/
 .eggs/
 .coverage
@@ -17,3 +18,5 @@ htmlcov/
 # vscode
 .vscode
+.DS_Store

.travis.yml

@@ -1,4 +1,3 @@
-sudo: true
 language: python
 cache: pip
 addons:
@@ -8,10 +7,6 @@ install:
   - make install
 jobs:
   include:
-    - stage: test
-      script:
-        - make test
-      python: '2.7'
     - stage: test
       script:
         - make test

camelot/core.py

@@ -38,7 +38,7 @@ class TextEdge(object):
     intersections: int
         Number of intersections with horizontal text rows.
     is_valid: bool
-        A text edge is valid if it intersections with at least
+        A text edge is valid if it intersects with at least
         TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
     """
@@ -65,7 +65,8 @@ class TextEdge(object):
         the is_valid attribute.
         """
         if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
+            self.x = (self.intersections * self.x + x) / \
+                float(self.intersections + 1)
             self.y0 = y0
             self.intersections += 1
             # a textedge is valid only if it extends uninterrupted
@@ -141,13 +142,16 @@ class TextEdges(object):
         """
         intersections_sum = {
             "left": sum(
-                te.intersections for te in self._textedges["left"] if te.is_valid
+                te.intersections for te in self._textedges["left"]
+                if te.is_valid
             ),
             "right": sum(
-                te.intersections for te in self._textedges["right"] if te.is_valid
+                te.intersections for te in self._textedges["right"]
+                if te.is_valid
             ),
             "middle": sum(
-                te.intersections for te in self._textedges["middle"] if te.is_valid
+                te.intersections for te in self._textedges["middle"]
+                if te.is_valid
             ),
         }
@@ -292,7 +296,10 @@ class Cell(object):
     def __repr__(self):
         return "<Cell x1={} y1={} x2={} y2={}>".format(
-            round(self.x1, 2), round(self.y1, 2), round(self.x2, 2), round(self.y2, 2)
+            round(self.x1, 2),
+            round(self.y1, 2),
+            round(self.x2, 2),
+            round(self.y2, 2)
         )
     @property
@@ -342,7 +349,9 @@ class Table(object):
     def __init__(self, cols, rows):
        self.cols = cols
        self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
        self.df = None
        self.shape = (0, 0)
        self.accuracy = 0
@@ -579,7 +588,8 @@ class Table(object):
             Output filepath.
         """
-        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+        kw = {"encoding": "utf-8", "index": False, "header": False,
+              "quoting": 1}
         kw.update(kwargs)
         self.df.to_csv(path, **kw)
@@ -616,6 +626,7 @@ class Table(object):
             "encoding": "utf-8",
         }
         kw.update(kwargs)
+        # pylint: disable=abstract-class-instantiated
         writer = pd.ExcelWriter(path)
         self.df.to_excel(writer, **kw)
         writer.save()
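pd.ExcelWriter declares abstract methods and resolves to a concrete engine (openpyxl or xlsxwriter) in __new__ at runtime, so pylint's abstract-class-instantiated warning on these call sites is a false positive; the disable comments silence it. A standalone sketch of the same pattern (assumes only pandas, and the pandas < 1.2 writer.save() API used above):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # pylint: disable=abstract-class-instantiated
    writer = pd.ExcelWriter("out.xlsx")  # dispatches to a concrete engine
    df.to_excel(writer, index=False)
    writer.save()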
@@ -692,7 +703,8 @@ class TableList(object):
         ext = kwargs.get("ext")
         for table in self._tables:
             filename = os.path.join(
-                "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
+                "{}-page-{}-table-{}{}".format(root, table.page, table.order,
+                                               ext)
             )
             filepath = os.path.join(dirname, filename)
             to_format = self._format_func(table, f)
@@ -707,7 +719,10 @@ class TableList(object):
         with zipfile.ZipFile(zipname, "w", allowZip64=True) as z:
             for table in self._tables:
                 filename = os.path.join(
-                    "{}-page-{}-table-{}{}".format(root, table.page, table.order, ext)
+                    "{}-page-{}-table-{}{}".format(root,
+                                                   table.page,
+                                                   table.order,
+                                                   ext)
                 )
                 filepath = os.path.join(dirname, filename)
                 z.write(filepath, os.path.basename(filepath))
@@ -739,10 +754,12 @@ class TableList(object):
             self._compress_dir(**kwargs)
         elif f == "excel":
             filepath = os.path.join(dirname, basename)
+            # pylint: disable=abstract-class-instantiated
             writer = pd.ExcelWriter(filepath)
             for table in self._tables:
                 sheet_name = "page-{}-table-{}".format(table.page, table.order)
-                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+                table.df.to_excel(writer, sheet_name=sheet_name,
+                                  encoding="utf-8")
             writer.save()
             if compress:
                 zipname = os.path.join(os.path.dirname(path), root) + ".zip"

camelot/handlers.py

@@ -113,14 +113,20 @@ class PDFHandler(object):
                 outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, dim = get_page_layout(fpath)
+            layout, __ = get_page_layout(fpath)
             # fix rotated PDF
             chars = get_text_objects(layout, ltype="char")
             horizontal_text = get_text_objects(layout, ltype="horizontal_text")
             vertical_text = get_text_objects(layout, ltype="vertical_text")
             rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                fpath_new = "".join(
+                    [
+                        froot.replace("page", "p"),
+                        "_rotated",
+                        fext
+                    ]
+                )
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                 if infile.isEncrypted:
@@ -136,7 +142,8 @@ class PDFHandler(object):
                     outfile.write(f)
     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None,
+        **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -149,7 +156,7 @@ class PDFHandler(object):
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
         kwargs : dict
             See camelot.read_pdf kwargs.
@@ -159,17 +166,21 @@ class PDFHandler(object):
             List of tables found in PDF.
         """
+        layout_kwargs = layout_kwargs or {}
         tables = []
         with TemporaryDirectory() as tempdir:
             for p in self.pages:
                 self._save_page(self.filepath, p, tempdir)
             pages = [
-                os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
+                os.path.join(tempdir, "page-{0}.pdf".format(p))
+                for p in self.pages
             ]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
+            parser = Lattice(**kwargs) \
+                if flavor == "lattice" else Stream(**kwargs)
             for p in pages:
                 t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+                    p, suppress_stdout=suppress_stdout,
+                    layout_kwargs=layout_kwargs
                 )
                 tables.extend(t)
         return TableList(sorted(tables))

camelot/io.py

@@ -12,7 +12,7 @@ def read_pdf(
     password=None,
     flavor="lattice",
     suppress_stdout=False,
-    layout_kwargs={},
+    layout_kwargs=None,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -80,16 +80,16 @@ def read_pdf(
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     threshold_constant* : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     iterations* : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
     resolution* : int, optional (default: 300)
         Resolution used for PDF to PNG conversion.
@@ -98,6 +98,7 @@ def read_pdf(
     tables : camelot.core.TableList
     """
+    layout_kwargs = layout_kwargs or {}
     if flavor not in ["lattice", "stream"]:
         raise NotImplementedError(
             "Unknown flavor specified." " Use either 'lattice' or 'stream'"

camelot/parsers/base.py

@@ -12,9 +12,18 @@ class BaseParser(object):
     def _generate_layout(self, filename, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout, self.dimensions = get_page_layout(
+            filename,
+            **layout_kwargs
+        )
         self.images = get_text_objects(self.layout, ltype="image")
-        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
-        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+        self.horizontal_text = get_text_objects(
+            self.layout,
+            ltype="horizontal_text"
+        )
+        self.vertical_text = get_text_objects(
+            self.layout,
+            ltype="vertical_text"
+        )
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)

camelot/parsers/lattice.py

@@ -2,14 +2,10 @@
 from __future__ import division
 import os
-import sys
 import copy
-import locale
 import logging
 import warnings
-import subprocess
-import numpy as np
 import pandas as pd
 from .base import BaseParser
@@ -80,7 +76,7 @@ class Lattice(BaseParser):
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     threshold_constant : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
@@ -102,7 +98,7 @@ class Lattice(BaseParser):
         process_background=False,
         line_scale=15,
         copy_text=None,
-        shift_text=["l", "t"],
+        shift_text=None,
         split_text=False,
         flag_size=False,
         strip_text="",
@@ -114,6 +110,7 @@ class Lattice(BaseParser):
         resolution=300,
         **kwargs
     ):
+        shift_text = shift_text or ["l", "t"]
         self.table_regions = table_regions
         self.table_areas = table_areas
         self.process_background = process_background
@@ -217,8 +214,7 @@ class Lattice(BaseParser):
         )
         gs_call = gs_call.encode().split()
         null = open(os.devnull, "wb")
-        with Ghostscript(*gs_call, stdout=null) as gs:
-            pass
+        Ghostscript(*gs_call, stdout=null)
         null.close()
     def _generate_table_bbox(self):
@@ -247,7 +243,8 @@ class Lattice(BaseParser):
         image_height_scaler = image_height / float(self.pdf_height)
         pdf_width_scaler = self.pdf_width / float(image_width)
         pdf_height_scaler = self.pdf_height / float(image_height)
-        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
+        image_scalers = (image_width_scaler,
+                         image_height_scaler, self.pdf_height)
         pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
         if self.table_areas is None:
@@ -291,7 +288,11 @@ class Lattice(BaseParser):
         self.table_bbox_unscaled = copy.deepcopy(table_bbox)
-        self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
+        [
+            self.table_bbox,
+            self.vertical_segments,
+            self.horizontal_segments
+        ] = scale_image(
             table_bbox, vertical_segments, horizontal_segments, pdf_scalers
         )
@@ -315,7 +316,10 @@ class Lattice(BaseParser):
             rows.extend([tk[1], tk[3]])
             # sort horizontal and vertical segments
             cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-            rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
+            rows = merge_close_lines(
+                sorted(rows, reverse=True),
+                line_tol=self.line_tol
+            )
             # make grid using x and y coord of shortlisted rows and cols
             cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
             rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
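merge_close_lines collapses nearly coincident segment coordinates before the cell grid is built, so double-drawn rulings within line_tol become a single line. A rough equivalent of the idea (not camelot's exact implementation):

    def merge_close_values(values, tol=2):
        # Average sorted coordinates that lie within `tol` of the last
        # kept value, so each cluster yields one representative.
        merged = []
        for v in values:
            if merged and abs(v - merged[-1]) <= tol:
                merged[-1] = (merged[-1] + v) / 2.0
            else:
                merged.append(v)
        return merged

    print(merge_close_values([10, 11, 50, 51, 52]))  # [10.5, 51.25]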
@@ -359,7 +363,10 @@ class Lattice(BaseParser):
         accuracy = compute_accuracy([[100, pos_errors]])
         if self.copy_text is not None:
-            table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
+            table = Lattice._copy_spanning_text(
+                table,
+                copy_text=self.copy_text
+            )
         data = table.data
         table.df = pd.DataFrame(data)
@@ -383,20 +390,28 @@ class Lattice(BaseParser):
         return table
-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+    def extract_tables(
+        self,
+        filename,
+        suppress_stdout=False,
+        layout_kwargs=None
+    ):
+        layout_kwargs = layout_kwargs or {}
         self._generate_layout(filename, layout_kwargs)
+        rootname = os.path.basename(self.rootname)
         if not suppress_stdout:
-            logger.info("Processing {}".format(os.path.basename(self.rootname)))
+            logger.info("Processing {rootname}".format(rootname=rootname))
         if not self.horizontal_text:
             if self.images:
                 warnings.warn(
-                    "{} is image-based, camelot only works on"
-                    " text-based pages.".format(os.path.basename(self.rootname))
+                    "{rootname} is image-based, "
+                    "camelot only works on text-based pages."
+                    .format(rootname=rootname)
                 )
             else:
                 warnings.warn(
-                    "No tables found on {}".format(os.path.basename(self.rootname))
+                    "No tables found on {rootname}".format(rootname=rootname)
                 )
             return []
@@ -408,8 +423,10 @@ class Lattice(BaseParser):
         for table_idx, tk in enumerate(
             sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
         ):
-            cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
-            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(
+                table_idx, tk)
+            table = self._generate_table(
+                table_idx, cols, rows, v_s=v_s, h_s=h_s)
             table._bbox = tk
             _tables.append(table)

camelot/parsers/stream.py

@@ -10,7 +10,8 @@ import pandas as pd
 from .base import BaseParser
 from ..core import TextEdges, Table
-from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
+from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
+                     compute_whitespace)
 logger = logging.getLogger("camelot")
@@ -70,6 +71,9 @@ class Stream(BaseParser):
     ):
         self.table_regions = table_regions
         self.table_areas = table_areas
+        self.table_bbox = None
+        self.t_bbox = None
+        self.textedges = []
         self.columns = columns
         self._validate_columns()
         self.split_text = split_text
@@ -95,10 +99,10 @@ class Stream(BaseParser):
             Tuple (x0, y0, x1, y1) in pdf coordinate space.
         """
-        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
-        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
-        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
-        ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
+        xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
+        ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
+        xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
+        ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
         text_bbox = (xmin, ymin, xmax, ymax)
         return text_bbox
@@ -119,21 +123,25 @@ class Stream(BaseParser):
             Two-dimensional list of text objects grouped into rows.
         """
-        row_y = 0
+        row_y = None
         rows = []
         temp = []
-        for t in text:
+        non_empty_text = [t for t in text if t.get_text().strip()]
+        for t in non_empty_text:
             # is checking for upright necessary?
-            # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-            # type(obj) is LTChar]):
-            if t.get_text().strip():
-                if not np.isclose(row_y, t.y0, atol=row_tol):
-                    rows.append(sorted(temp, key=lambda t: t.x0))
-                    temp = []
-                    row_y = t.y0
-                temp.append(t)
+            # if t.get_text().strip() and all([obj.upright \
+            # for obj in t._objs
+            # if type(obj) is LTChar]):
+            if row_y is None:
+                row_y = t.y0
+            elif not np.isclose(row_y, t.y0, atol=row_tol):
+                rows.append(sorted(temp, key=lambda t: t.x0))
+                temp = []
+            # We update the row's bottom as we go, to be forgiving if there
+            # is a gradual change across multiple columns.
+            row_y = t.y0
+            temp.append(t)
         rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # TODO: hacky
         return rows
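The rewrite above drops the old row_y = 0 seed, which always produced a bogus empty first row that then had to be popped (the removed rows.pop(0) hack), by seeding row_y from the first text object instead. The grouping idea, reduced to bare y-coordinates (illustrative only):

    import numpy as np

    def group_rows(y_coords, row_tol=2):
        # Cluster y-coordinates into rows, tolerating small jitter.
        rows, current, row_y = [], [], None
        for y in y_coords:
            if row_y is not None and not np.isclose(row_y, y, atol=row_tol):
                rows.append(current)
                current = []
            row_y = y  # track the latest y, forgiving gradual drift
            current.append(y)
        rows.append(current)
        return rows

    print(group_rows([700, 699.5, 650, 649, 600]))
    # [[700, 699.5], [650, 649], [600]]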
     @staticmethod
@@ -170,7 +178,8 @@ class Stream(BaseParser):
                     merged.append(higher)
             elif column_tol < 0:
                 if higher[0] <= lower[1]:
-                    if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
+                    if np.isclose(higher[0], lower[1],
+                                  atol=abs(column_tol)):
                         merged.append(higher)
                     else:
                         upper_bound = max(lower[1], higher[1])
@@ -198,10 +207,13 @@ class Stream(BaseParser):
         """
         row_mids = [
-            sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
+            sum((t.y0 + t.y1) / 2 for t in r) / len(r) if len(r) > 0 else 0
             for r in rows_grouped
         ]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+        rows = [
+            (row_mids[i] + row_mids[i - 1]) / 2
+            for i in range(1, len(row_mids))
+        ]
         rows.insert(0, text_y_max)
         rows.append(text_y_min)
         rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@@ -230,7 +242,9 @@ class Stream(BaseParser):
         text = Stream._group_rows(text, row_tol=row_tol)
         elements = [len(r) for r in text]
         new_cols = [
-            (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
+            (t.x0, t.x1)
+            for r in text if len(r) == max(elements)
+            for t in r
         ]
         cols.extend(Stream._merge_columns(sorted(new_cols)))
         return cols
@@ -262,12 +276,13 @@ class Stream(BaseParser):
     def _validate_columns(self):
         if self.table_areas is not None and self.columns is not None:
             if len(self.table_areas) != len(self.columns):
-                raise ValueError("Length of table_areas and columns" " should be equal")
+                raise ValueError("Length of table_areas and columns"
+                                 " should be equal")
     def _nurminen_table_detection(self, textlines):
         """A general implementation of the table detection algorithm
         described by Anssi Nurminen's master's thesis.
-        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3  # noqa
         Assumes that tables are situated relatively far apart
         vertically.
@@ -284,7 +299,7 @@ class Stream(BaseParser):
         # guess table areas using textlines and relevant edges
         table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
         # treat whole page as table area if no table areas found
-        if not len(table_bbox):
+        if not table_bbox:
             table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
         return table_bbox
@@ -302,7 +317,8 @@ class Stream(BaseParser):
                 y1 = float(y1)
                 x2 = float(x2)
                 y2 = float(y2)
-                region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+                region_text = text_in_bbox(
+                    (x1, y2, x2, y1), self.horizontal_text)
                 hor_text.extend(region_text)
             # find tables based on nurminen's detection algorithm
             table_bbox = self._nurminen_table_detection(hor_text)
@@ -328,8 +344,10 @@ class Stream(BaseParser):
         self.t_bbox = t_bbox
-        text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
-        rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
+        text_x_min, text_y_min, text_x_max, text_y_max = \
+            self._text_bbox(self.t_bbox)
+        rows_grouped = self._group_rows(
+            self.t_bbox["horizontal"], row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
         elements = [len(r) for r in rows_grouped]
@@ -354,14 +372,23 @@ class Stream(BaseParser):
                 # see if the list contains elements, if yes, then use
                 # the mode after removing 1s
                 elements = list(filter(lambda x: x != 1, elements))
-                if len(elements):
+                if elements:
                     ncols = max(set(elements), key=elements.count)
                 else:
                     warnings.warn(
-                        "No tables found in table area {}".format(table_idx + 1)
+                        "No tables found in table area {}"
+                        .format(table_idx + 1)
                     )
-            cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
-            cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
             inner_text = []
             for i in range(1, len(cols)):
                 left = cols[i - 1][1]
@@ -431,23 +458,30 @@ class Stream(BaseParser):
         return table
-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+    def extract_tables(self, filename, suppress_stdout=False,
+                       layout_kwargs=None):
+        layout_kwargs = layout_kwargs or {}
         self._generate_layout(filename, layout_kwargs)
         if not suppress_stdout:
-            logger.info("Processing {}".format(os.path.basename(self.rootname)))
+            logger.info("Processing {}".format(
+                os.path.basename(self.rootname)))
         if not self.horizontal_text:
             if self.images:
                 warnings.warn(
                     "{} is image-based, camelot only works on"
-                    " text-based pages.".format(os.path.basename(self.rootname))
+                    " text-based pages.".format(
+                        os.path.basename(self.rootname))
                 )
             else:
                 warnings.warn(
-                    "No tables found on {}".format(os.path.basename(self.rootname))
+                    "No tables found on {}".format(
+                        os.path.basename(self.rootname))
                 )
             return []
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
         self._generate_table_bbox()
         _tables = []

camelot/plotting.py

@@ -37,7 +37,7 @@ class PlotMethods(object):
             raise NotImplementedError(
                 "Lattice flavor does not support kind='{}'".format(kind)
             )
-        elif table.flavor == "stream" and kind in ["joint", "line"]:
+        elif table.flavor == "stream" and kind in ["line"]:
             raise NotImplementedError(
                 "Stream flavor does not support kind='{}'".format(kind)
             )
@@ -64,7 +64,13 @@ class PlotMethods(object):
         for t in table._text:
             xs.extend([t[0], t[2]])
             ys.extend([t[1], t[3]])
-            ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
+            ax.add_patch(
+                patches.Rectangle(
+                    (t[0], t[1]),
+                    t[2] - t[0],
+                    t[3] - t[1]
+                )
+            )
         ax.set_xlim(min(xs) - 10, max(xs) + 10)
         ax.set_ylim(min(ys) - 10, max(ys) + 10)
         return fig
@@ -132,7 +138,8 @@ class PlotMethods(object):
         for t in table_bbox.keys():
             ax.add_patch(
                 patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
+                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
+                    fill=False, color="red"
                 )
             )
         if not _FOR_LATTICE:
@@ -164,7 +171,10 @@ class PlotMethods(object):
             xs.extend([t[0], t[2]])
             ys.extend([t[1], t[3]])
             ax.add_patch(
-                patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
+                patches.Rectangle(
+                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
+                    color="blue"
+                )
             )
         ax.set_xlim(min(xs) - 10, max(xs) + 10)
         ax.set_ylim(min(ys) - 10, max(ys) + 10)

camelot/utils.py

@@ -30,6 +30,9 @@ from pdfminer.layout import (
 )
+# pylint: disable=import-error
+# PyLint will evaluate both branches, and will necessarily complain about one
+# of them.
 PY3 = sys.version_info[0] >= 3
 if PY3:
     from urllib.request import urlopen
@@ -310,7 +313,8 @@ def get_rotation(chars, horizontal_text, vertical_text):
     if hlen < vlen:
         clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
         anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
-        rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
+        rotation = "anticlockwise" if clockwise < anticlockwise \
+            else "clockwise"
     return rotation
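get_rotation reads pdfminer character matrices: an LTChar's matrix is (a, b, c, d, e, f), and for text rotated by 90 degrees the shear terms b and c carry the signs the two counters above test for. A toy check of that sign convention (illustrative, not the full function):

    import math

    def rotation_terms(deg):
        # pdfminer-style (a, b, c, d) matrix terms for text at `deg`.
        r = math.radians(deg)
        return (math.cos(r), math.sin(r), -math.sin(r), math.cos(r))

    a, b, c, d = rotation_terms(90)
    print(b > 0 and c < 0)  # True: counted by `anticlockwise` above
    a, b, c, d = rotation_terms(-90)
    print(b < 0 and c > 0)  # True: counted by `clockwise` above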
@@ -341,12 +345,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
     v_s = [
         v
         for v in v_segments
-        if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
+        if v[1] > lb[1] - 2 and
+        v[3] < rt[1] + 2 and
+        lb[0] - 2 <= v[0] <= rt[0] + 2
     ]
     h_s = [
         h
         for h in h_segments
-        if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
+        if h[0] > lb[0] - 2 and
+        h[2] < rt[0] + 2 and
+        lb[1] - 2 <= h[1] <= rt[1] + 2
     ]
     return v_s, h_s
@@ -464,10 +472,10 @@ def flag_font_size(textline, direction, strip_text=""):
         for t in textline
         if not isinstance(t, LTAnno)
     ]
-    l = [np.round(size, decimals=6) for text, size in d]
-    if len(set(l)) > 1:
+    text_sizes = [np.round(size, decimals=6) for text, size in d]
+    if len(set(text_sizes)) > 1:
         flist = []
-        min_size = min(l)
+        min_size = min(text_sizes)
         for key, chars in groupby(d, itemgetter(1)):
             if key == min_size:
                 fchars = [t[0] for t in chars]
@@ -511,7 +519,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
        of row/column and text is the an lttextline substring.
     """
-    idx = 0
     cut_text = []
     bbox = textline.bbox
     try:
@@ -528,7 +535,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             r = r_idx[0]
             x_cuts = [
-                (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+                (c, table.cells[r][c].x2)
+                for c in x_overlap
+                if table.cells[r][c].right
             ]
             if not x_cuts:
                 x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
@@ -561,7 +570,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             c = c_idx[0]
             y_cuts = [
-                (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+                (r, table.cells[r][c].y1)
+                for r in y_overlap
+                if table.cells[r][c].bottom
             ]
             if not y_cuts:
                 y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
@@ -644,9 +655,8 @@ def get_table_index(
     """
     r_idx, c_idx = [-1] * 2
     for r in range(len(table.rows)):
-        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
-            r
-        ][1]:
+        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
+                (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
             lt_col_overlap = []
             for c in table.cols:
                 if c[0] <= t.x1 and c[1] >= t.x0:
@@ -681,7 +691,9 @@ def get_table_index(
     X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
     Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
     charea = X * Y
-    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
+    error = (
+        (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
+    ) / charea
     if split_text:
         return (
@@ -697,13 +709,16 @@ def get_table_index(
                 (
                     r_idx,
                     c_idx,
-                    flag_font_size(t._objs, direction, strip_text=strip_text),
+                    flag_font_size(t._objs,
+                                   direction,
+                                   strip_text=strip_text),
                 )
             ],
             error,
         )
     else:
-        return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+        return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
+            error
 def compute_accuracy(error_weights):
@@ -751,7 +766,6 @@ def compute_whitespace(d):
     """
     whitespace = 0
-    r_nempty_cells, c_nempty_cells = [], []
     for i in d:
         for j in i:
             if j.strip() == "":
@@ -811,6 +825,7 @@ def get_page_layout(
             width = layout.bbox[2]
             height = layout.bbox[3]
             dim = (width, height)
+            break  # we assume a single page pdf
     return layout, dim

docs/user/install.rst

@@ -13,7 +13,7 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa
     $ conda install -c conda-forge camelot-py
-.. note:: Camelot is available for Python 2.7, 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_.
+.. note:: Camelot is available for Python 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_.
 .. _conda: https://conda.io/docs/
 .. _Anaconda: http://docs.continuum.io/anaconda/

requirements.txt

@@ -4,5 +4,5 @@ numpy>=1.13.3
 opencv-python>=3.4.2.17
 openpyxl>=2.5.8
 pandas>=0.23.4
-pdfminer.six>=20170720
+pdfminer.six>=20200402
 PyPDF2>=1.26.0

setup.py

@@ -19,7 +19,7 @@ requires = [
     'numpy>=1.13.3',
     'openpyxl>=2.5.8',
     'pandas>=0.23.4',
-    'pdfminer.six>=20170720',
+    'pdfminer.six>=20200402',
     'PyPDF2>=1.26.0'
 ]
@@ -69,9 +69,8 @@ def setup_package():
         },
         classifiers=[
             # Trove classifiers
-            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers  # noqa
             'License :: OSI Approved :: MIT License',
-            'Programming Language :: Python :: 2.7',
             'Programming Language :: Python :: 3.5',
             'Programming Language :: Python :: 3.6',
             'Programming Language :: Python :: 3.7'

tests/data.py

@@ -4,16 +4,6 @@ from __future__ import unicode_literals
 data_stream = [
-    [
-        "",
-        "Table: 5 Public Health Outlay 2012-13 (Budget Estimates) (Rs. in 000)",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-    ],
     ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
     ["", "", "", "", "", "Revenue &", "", ""],
     ["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
@@ -80,7 +70,8 @@ data_stream = [
         "5,000",
         "33,051,480",
     ],
-    ["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560", "4,508,180"],
+    ["Goa", "4,055,567", "110,000", "330,053", "0", "4,495,620", "12,560",
+     "4,508,180"],
     [
         "Gujarat",
         "26,328,400",
@@ -171,7 +162,8 @@ data_stream = [
         "313,762",
         "67,044,159",
     ],
-    ["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700", "0", "3,579,700"],
+    ["Manipur", "2,494,600", "187,700", "897,400", "0", "3,579,700",
+     "0", "3,579,700"],
    [
         "Meghalaya",
         "2,894,093",
@@ -236,7 +228,8 @@
 data_stream_table_rotated = [
     [
-        "Table 21 Current use of contraception by background characteristics\u2014Continued",
+        "Table 21 Current use of contraception by background characteristics"
+        "\u2014Continued",
         "",
         "",
         "",
@@ -330,7 +323,8 @@ data_stream_table_rotated = [
         "Total",
         "women",
     ],
-    ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
+    ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+     "", ""],
     [
         "Scheduled caste",
         "74.8",
@@ -407,7 +401,8 @@ data_stream_table_rotated = [
         "100.0",
         "3,319",
     ],
-    ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""],
+    ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "",
+     "", "", "", ""],
     [
         "Lowest",
         "64.5",
@@ -830,7 +825,8 @@ data_stream_table_rotated = [
 data_stream_two_tables_1 = [
     [
-        "[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)",
+        "Program. Represents arrests reported (not charged) by 12,910 "
+        "agencies with a total population of 247,526,916 as estimated",
         "",
         "",
         "",
         "",
         "",
         "",
@@ -842,7 +838,8 @@ data_stream_two_tables_1 = [
         "",
     ],
     [
-        "Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated",
+        "by the FBI. Some persons may be arrested more than once during a "
+        "year, therefore, the data in this table, in some cases,",
         "",
         "",
         "",
         "",
         "",
         "",
@@ -854,19 +851,8 @@ data_stream_two_tables_1 = [
         "",
     ],
     [
-        "by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-    ],
-    [
-        "could represent multiple arrests of the same person. See text, this section and source]",
+        "could represent multiple arrests of the same person. See text, "
+        "this section and source]",
         "",
         "",
         "",
@@ -903,7 +889,8 @@ data_stream_two_tables_1 = [
         "and over",
     ],
     [
-        "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . .\n . .\n . .\n . . .",
+        "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . "
+        ".\n . .\n . .\n . . .",
         "11,062 .6",
         "1,540 .0",
         "9,522 .6",
@@ -915,7 +902,8 @@ data_stream_two_tables_1 = [
         "2,330 .9",
     ],
     [
-        "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n . .",
+        "Violent crime . . . . . . . .\n . .\n . .\n . .\n . "
+        ".\n . .",
         "467 .9",
         "69 .1",
         "398 .8",
@@ -976,7 +964,8 @@ data_stream_two_tables_1 = [
         "64.5",
     ],
     [
-        "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . . .",
+        "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . "
+        ". .",
         "1,396 .4",
         "338 .7",
         "1,057 .7",
@@ -1060,7 +1049,8 @@ data_stream_two_tables_1 = [
         "25.5",
     ],
     [
-        "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n.",
+        "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. "
+        ".\n.\n.\n.",
         "173.7",
         "5.1",
         "168.5",
@@ -1290,19 +1280,8 @@ data_stream_two_tables_1 = [
     ],
     [
         "",
-        " Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-    ],
-    [
-        "",
-        "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
+        " Represents zero. X Not applicable. 1 Buying, receiving, "
+        "possessing stolen property. 2 Except forcible rape and prostitution.",
         "",
         "",
         "",
         "",
 ]
 data_stream_two_tables_2 = [
-    [
-        "",
-        "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.",
-        "",
-        "",
-        "",
-        "",
-    ],
     ["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
     [
-        "[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies",
+        "[Based on Uniform Crime Reporting (UCR) Program. Represents "
+        "arrests reported (not charged) by 12,371 agencies",
         "",
         "",
         "",
@@ -1333,7 +1305,8 @@ data_stream_two_tables_2 = [
         "",
     ],
     [
-        "with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]",
+        "with a total population of 239,839,971 as estimated by the FBI. "
+        "See headnote, Table 324]",
         "",
         "",
         "",
@@ -1344,7 +1317,8 @@ data_stream_two_tables_2 = [
     ["Offense charged", "", "", "", "Indian/Alaskan", "Asian Pacific"],
     ["", "Total", "White", "Black", "Native", "Islander"],
     [
-        "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n .\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .",
+        "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n "
+        ".\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .",
         "10,690,561",
         "7,389,208",
         "3,027,153",
@@ -1352,7 +1326,8 @@ data_stream_two_tables_2 = [
         "123,656",
     ],
     [
-        "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .",
+        "Violent crime . . . . . . . .\n . .\n . .\n . .\n . "
+        ".\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .",
         "456,965",
         "268,346",
         "177,766",
@@ -1368,7 +1343,8 @@ data_stream_two_tables_2 = [
         "97",
     ],
     [
-        "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .",
+        "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. "
+        ".\n.\n.\n.\n.\n. .",
         "16,362",
         "10,644",
         "5,319",
@@ -1376,7 +1352,8 @@ data_stream_two_tables_2 = [
         "230",
     ],
     [
-        "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .",
+        "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. "
+        ".\n.\n.\n. .\n.\n.\n. . . .",
         "100,496",
         "43,039",
         "55,742",
@@ -1384,7 +1361,8 @@ data_stream_two_tables_2 = [
         "989",
     ],
     [
-        "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. .\n.\n.\n.",
+        "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. "
+        ".\n.\n.\n.",
         "330,368",
         "209,922",
         "111,904",
@@ -1392,7 +1370,8 @@ data_stream_two_tables_2 = [
         "3,929",
     ],
     [
-        "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n . .\n .\n .\n .\n . .\n .\n . .\n .\n .",
+        "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n "
+        ". .\n .\n .\n .\n . .\n .\n . .\n .\n .",
         "1,364,409",
         "922,139",
         "406,382",
@@ -1400,7 +1379,8 @@ data_stream_two_tables_2 = [
         "18,289",
     ],
     [
-        "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n. . . .",
+        "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. "
+        ".\n.\n.\n. .\n.\n. . . .",
         "234,551",
         "155,994",
         "74,419",
@@ -1408,7 +1388,8 @@ data_stream_two_tables_2 = [
         "2,117",
     ],
     [
-        "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .",
+        "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. "
+        ".\n.\n.\n.\n.\n. .",
         "1,056,473",
         "719,983",
         "306,625",
@@ -1416,7 +1397,8 @@ data_stream_two_tables_2 = [
         "15,219",
     ],
     [
-        "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.",
+        "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. "
+        ".\n.\n. .\n.",
         "63,919",
         "39,077",
         "23,184",
@@ -1424,7 +1406,8 @@ data_stream_two_tables_2 = [
         "841",
     ],
     [
-        "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . . . .",
+        "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. "
+        ".\n.\n.\n. .\n.\n.\n. . . . . .",
         "9,466",
         "7,085",
         "2,154",
@@ -1432,7 +1415,8 @@ data_stream_two_tables_2 = [
         "112",
     ],
     [
-        "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.",
+        "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. "
+        ".\n.\n.\n.\n. .\n.\n. .\n.",
         "1,032,502",
         "672,865",
         "332,435",
@@ -1440,7 +1424,8 @@ data_stream_two_tables_2 = [
         "12,075",
     ],
     [
-        "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. .\n. .\n.",
+        "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. "
+        ".\n. .\n.",
         "67,054",
         "44,730",
         "21,251",
@@ -1448,7 +1433,8 @@ data_stream_two_tables_2 = [
         "728",
     ],
     [
-        "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. . . . . . .",
+        "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. "
+        ".\n.\n.\n. . . . . . .",
         "161,233",
         "108,032",
         "50,367",
@@ -1456,7 +1442,8 @@ data_stream_two_tables_2 = [
         "1,519",
     ],
     [
-        "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n. .\n.\n.\n.\n.",
+        "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. "
+        ".\n.\n. .\n.\n.\n.\n.",
         "13,960",
         "9,208",
         "4,429",
@@ -1472,7 +1459,8 @@ data_stream_two_tables_2 = [
         "742",
     ],
     [
-        "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .",
+        "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. "
+        ".\n. .\n.\n.\n.\n. .",
         "212,173",
         "157,723",
         "48,746",
@@ -1496,7 +1484,8 @@ data_stream_two_tables_2 = [
         "1,413",
     ],
     [
-        "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .",
+        "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. "
+        ".\n.\n.\n.\n.\n. .",
         "60,175",
         "44,240",
         "14,347",
@@ -1504,7 +1493,8 @@ data_stream_two_tables_2 = [
         "873",
     ],
     [
-        "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.",
+        "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. "
+        ".\n.\n.\n.\n.",
         "1,301,629",
         "845,974",
         "437,623",
@@ -1512,7 +1502,8 @@ data_stream_two_tables_2 = [
         "9,444",
     ],
     [
-        "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . .\n.\n.\n.\n.\n. .\n. .",
+        "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . "
+        ".\n.\n.\n.\n.\n. .\n. .",
         "8,046",
         "2,290",
         "5,518",
@@ -1528,7 +1519,8 @@ data_stream_two_tables_2 = [
         "624",
     ],
     [
-        "Driving under the influence . . . . . . .\n. .\n.\n. .\n.\n.\n.\n.\n. .",
+        "Driving under the influence . . . . . . .\n. .\n.\n. "
+        ".\n.\n.\n.\n.\n. .",
         "1,105,401",
         "954,444",
         "121,594",
@@ -1536,7 +1528,8 @@ data_stream_two_tables_2 = [
         "14,460",
     ],
     [
-        "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.",
+        "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . "
+        ".\n.\n.\n.\n. .\n. .\n.\n.\n.\n.",
         "444,087",
         "373,189",
         "50,431",
@@ -1544,7 +1537,8 @@ data_stream_two_tables_2 = [
         "5,591",
     ],
     [
-        "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . .\n.\n.\n.\n.\n.\n.",
+        "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . "
+        ".\n.\n.\n.\n.\n.\n.",
         "469,958",
         "387,542",
         "71,020",
@@ -1552,7 +1546,8 @@ data_stream_two_tables_2 = [
         "2,844",
     ],
     [
-        "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.",
+        "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. "
+        ".\n.\n.\n.\n.",
         "515,689",
         "326,563",
         "176,169",
@@ -1560,7 +1555,8 @@ data_stream_two_tables_2 = [
         "4,174",
     ],
     [
-        "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .",
+        "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. "
+        ".\n.\n.\n. .\n.\n.\n. . . .",
         "26,347",
         "14,581",
         "11,031",
@@ -1568,7 +1564,8 @@ data_stream_two_tables_2 = [
         "192",
     ],
     [
-        "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. .\n.",
+        "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. "
+        ".\n.",
         "2,929,217",
         "1,937,221",
         "911,670",
@@ -1576,7 +1573,8 @@ data_stream_two_tables_2 = [
         "36,446",
     ],
     [
-        "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.\n.\n. .\n. . . .",
+        "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. "
+        ".\n.\n.\n.\n.\n. .\n. . . .",
         "1,513",
         "677",
         "828",
@@ -1592,7 +1590,8 @@ data_stream_two_tables_2 = [
         "1,060",
     ],
     [
-        "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .",
+        "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. "
+        ".\n. .\n.\n.\n.\n. .",
         "73,616",
         "48,343",
         "19,670",
@@ -1600,14 +1599,6 @@ data_stream_two_tables_2 = [
         "3,950",
     ],
     ["1 Except forcible rape and prostitution.", "", "", "", "", ""],
-    [
-        "",
-        "Source: U.S. Department of Justice, Federal Bureau of Investigation, \u201cCrime in the United States, Arrests,\u201d September 2010,",
-        "",
-        "",
-        "",
-        "",
-    ],
 ]
 data_stream_table_areas = [
@@ -1634,10 +1625,12 @@ data_stream_columns = [
         "Nombre Localidad",
     ],
     ["Entidad", "", "Municipio", "", "Localidad", ""],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0094",
+     "Granja Adelita"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0102", "Los Arbolitos [Rancho]"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0102",
+     "Los Arbolitos [Rancho]"],
     [
         "01",
         "Aguascalientes",
@@ -1655,7 +1648,8 @@ data_stream_columns = [
         "0112",
         "Baj\xedo los V\xe1zquez",
     ],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0113", "Baj\xedo de Montoro"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0113",
+     "Baj\xedo de Montoro"],
     [
         "01",
         "Aguascalientes",
@@ -1697,8 +1691,10 @@ data_stream_columns = [
         "Ca\xf1ada Honda [Estaci\xf3n]",
     ],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0127", "Los Ca\xf1os"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0128", "El Cari\xf1\xe1n"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0129", "El Carmen [Granja]"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0128",
+     "El Cari\xf1\xe1n"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0129",
+     "El Carmen [Granja]"],
     [
         "01",
         "Aguascalientes",
@@ -1733,9 +1729,11 @@ data_stream_columns = [
         "El Colorado (El Soyatal)",
     ],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0146", "El Conejal"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0157", "Cotorina de Abajo"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0157",
+     "Cotorina de Abajo"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0162", "Coyotes"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0166", "La Huerta (La Cruz)"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0166",
+     "La Huerta (La Cruz)"],
     [
         "01",
         "Aguascalientes",
@@ -1752,17 +1750,20 @@ data_stream_columns = [
         "0171",
         "Los Cuervos (Los Ojos de Agua)",
     ],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0172", "San Jos\xe9 [Granja]"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0172",
+     "San Jos\xe9 [Granja]"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0176", "La Chiripa"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0182", "Dolores"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0183", "Los Dolores"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0190", "El Duraznillo"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0191", "Los Dur\xf3n"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0197", "La Escondida"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0201", "Brande Vin [Bodegas]"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0201",
+     "Brande Vin [Bodegas]"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0207", "Valle Redondo"],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0209", "La Fortuna"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0212", "Lomas del Gachup\xedn"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0212",
+     "Lomas del Gachup\xedn"],
     [
         "01",
         "Aguascalientes",
@@ -1772,22 +1773,12 @@ data_stream_columns = [
         "El Carmen (Gallinas G\xfceras) [Rancho]",
     ],
     ["01", "Aguascalientes", "001", "Aguascalientes", "0216", "La Gloria"],
-    ["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"],
+    ["01", "Aguascalientes", "001", "Aguascalientes", "0226",
+     "Hacienda Nueva"],
 ]
 data_stream_split_text = [
-    [
-        "FEB",
-        "RUAR",
-        "Y 2014 M27 (BUS)",
-        "",
-        "ALPHABETIC LISTING BY T",
-        "YPE",
-        "",
-        "",
-        "",
-        "ABLPDM27",
-    ],
+    ["FEB", "RUAR", "Y 2014 M27 (BUS)", "", "", "", "", "", "", ""],
     ["", "", "", "", "OF ACTIVE LICENSES", "", "", "", "", "3/19/2014"],
     ["", "", "", "", "OKLAHOMA ABLE COMMIS", "SION", "", "", "", ""],
     ["LICENSE", "", "", "", "PREMISE", "", "", "", "", ""],
@@ -1977,7 +1968,18 @@ data_stream_split_text = [
         "(872) 825-8309",
         "2014/04/11",
     ],
-    ["", "", "A SENSU JAPANESE", "", "7123 SOUTH 92ND EAST", "", "", "", "", ""],
+    [
+        "",
+        "",
+        "A SENSU JAPANESE",
+        "",
+        "7123 SOUTH 92ND EAST",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
     [
         "625422",
         "BAW",
@@ -2029,7 +2031,18 @@ data_stream_split_text = [
         "(580) 928-2700",
         "2014/09/08",
     ],
-    ["", "", "ANDOLINI'S PIZZERIA &", "", "12140 EAST 96TH STREET", "", "", "", "", ""],
+    [
+        "",
+        "",
+        "ANDOLINI'S PIZZERIA &",
+        "",
+        "12140 EAST 96TH STREET",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
     [
         "428377",
         "BAW",
@@ -2148,7 +2161,8 @@ data_stream_flag_size = [
         "from SBI",
         "from",
     ],
-    ["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"],
+    ["", "Debt", "", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other",
+     "NCDC"],
     ["", "", "", "", "", "& FIs", "", "", "", "Banks", ""],
     ["1", "2=", "3", "4", "5", "6=", "7", "8", "9", "10", "11"],
     ["", "(3 to 6)+14", "", "", "", "(7 to13)", "", "", "", "", ""],
@@ -2165,7 +2179,8 @@ data_stream_flag_size = [
         "-",
         "0.25",
     ],
-    ["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-", "-", "-"],
+    ["Arunachal Pradesh", "1.23", "1.1", "-", "-", "0.13", "-", "-", "-",
+     "-", "-"],
    [
         "Assam",
         "12.69",
@@ -2194,8 +2209,10 @@ data_stream_flag_size = [
     ],
     ["Chhattisgarh", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
     ["Goa", "1.4", "1.02", "-", "-", "0.38", "0.31", "-", "0.07", "-", "-"],
-    ["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11", "-", "0.44"],
-    ["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64", "-", "0.49"],
+    ["Gujarat", "19.75", "17.1", "-", "-", "2.64", "1.17", "-", "1.11",
+     "-", "0.44"],
+    ["Haryana", "11.53", "9.67", "-", "0.06", "1.8", "0.55", "-", "0.64",
+     "-", "0.49"],
     [
         "Himachal Pradesh",
         "8.02",
@@ -2223,7 +2240,8 @@ data_stream_flag_size = [
         "-",
     ],
     ["Jharkhand", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
-    ["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89", "-", "0.69"],
+    ["Karnataka", "22.44", "19.59", "-", "-", "2.86", "1.22", "-", "0.89",
+     "-", "0.69"],
     [
         "Kerala",
"29.03", "29.03",
@ -2263,11 +2281,16 @@ data_stream_flag_size = [
"0.02", "0.02",
"2.89", "2.89",
], ],
["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-", "0.09"], ["Manipur", "2.17", "1.61", "-", "0.26", "0.29", "0.08", "-", "-", "-",
["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05", "-", "0.03"], "0.09"],
["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-", "-", "0.03"], ["Meghalaya", "1.36", "1.38", "-", "-", "-0.02", "0.04", "-", "-0.05",
["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-", "0.04"], "-", "0.03"],
["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66", "-", "0.2"], ["Mizoram", "1.17", "0.46", "-", "0.27", "0.43", "0.11", "-", "-",
"-", "0.03"],
["Nagaland", "2.99", "2.6", "-", "-", "0.39", "0.24", "-", "-", "-",
"0.04"],
["Odisha", "34.04", "27.58", "-", "4.4", "2.06", "0.56", "-", "0.66",
"-", "0.2"],
[ [
"Punjab", "Punjab",
"19.18", "19.18",
@ -2295,8 +2318,10 @@ data_stream_flag_size = [
"0.81", "0.81",
], ],
["Sikkim", "0.16", "-", "-", "-", "0.16", "0.03", "-", "-", "-", "0.01"], ["Sikkim", "0.16", "-", "-", "-", "0.16", "0.03", "-", "-", "-", "0.01"],
["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-", "0.68"], ["Tamil Nadu", "34.11", "31.41", "-", "-", "2.7", "1.3", "-", "0.6", "-",
["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-", "0.02"], "0.68"],
["Tripura", "2.3", "1.89", "-", "-", "0.41", "0.41", "-", "-0.05", "-",
"0.02"],
["Uttaranchal", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], ["Uttaranchal", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"],
[ [
"Uttar Pradesh", "Uttar Pradesh",
@ -2393,11 +2418,13 @@ data_stream_edge_tol = [
["Costs", "(0.21)"], ["Costs", "(0.21)"],
["T\notal investment result per unit", "3.78"], ["T\notal investment result per unit", "3.78"],
[ [
"1 The results cover the period from inception of the Fund at 8 April 2016 through 31 December 2016.", "1 The results cover the period from inception of the Fund at "
"8 April 2016 through 31 December 2016.",
"", "",
], ],
[ [
"2 The result per unit is calculated using the total number of outstanding unit as per the end of the", "2 The result per unit is calculated using the total number of "
"outstanding unit as per the end of the",
"", "",
], ],
["period.", ""], ["period.", ""],
@ -2454,7 +2481,8 @@ data_lattice_table_rotated = [
"Men", "Men",
"Women", "Women",
], ],
["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", "2391"], ["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645",
"2391"],
[ [
"Tamil Nadu", "Tamil Nadu",
"7387", "7387",
@ -2503,11 +2531,16 @@ data_lattice_table_rotated = [
"1417", "1417",
"1599", "1599",
], ],
["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122", "2503"], ["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122",
["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579", "1709"], "2503"],
["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093", "1628"], ["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579",
["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413", "2027"], "1709"],
["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185", "1366"], ["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093",
"1628"],
["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413",
"2027"],
["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185",
"1366"],
[ [
"Pooled", "Pooled",
"38742", "38742",
@ -2573,7 +2606,8 @@ data_lattice_two_tables_2 = [
] ]
data_lattice_table_regions = [ data_lattice_table_regions = [
["Età dellAssicurato \nallepoca del decesso", "Misura % di \nmaggiorazione"], ["Età dellAssicurato \nallepoca del decesso",
"Misura % di \nmaggiorazione"],
["18-75", "1,00%"], ["18-75", "1,00%"],
["76-80", "0,50%"], ["76-80", "0,50%"],
["81 in poi", "0,10%"], ["81 in poi", "0,10%"],
@ -2596,10 +2630,12 @@ data_lattice_table_areas = [
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""], ["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5", ""],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""], ["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2", ""],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""], ["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8", ""],
["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9", ""], ["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9",
""],
["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0", ""], ["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0", ""],
["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8", ""], ["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8", ""],
["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6", ""], ["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6",
""],
["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5", ""], ["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5", ""],
["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4", ""], ["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4", ""],
["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6", ""], ["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6", ""],
@ -2650,7 +2686,8 @@ data_lattice_process_background = [
"3,658", "3,658",
"3,183", "3,183",
], ],
["Kerala", "23.2.2010 to \n11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"], ["Kerala", "23.2.2010 to \n11.3.2010", "9", "17", "1.42", "3,559", "2,173",
"855"],
["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
] ]
@ -2689,7 +2726,8 @@ data_lattice_copy_text = [
["COHS", "San Mateo", "Health Plan of San Mateo", "113,202"], ["COHS", "San Mateo", "Health Plan of San Mateo", "113,202"],
["COHS", "Ventura", "Gold Coast Health Plan", "202,217"], ["COHS", "Ventura", "Gold Coast Health Plan", "202,217"],
["COHS", "Total COHS Enrollment", "", "2,176,064"], ["COHS", "Total COHS Enrollment", "", "2,176,064"],
["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "", "10,132,022"], ["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "",
"10,132,022"],
["PCCM", "Los Angeles", "AIDS Healthcare Foundation", "828"], ["PCCM", "Los Angeles", "AIDS Healthcare Foundation", "828"],
["PCCM", "San Francisco", "Family Mosaic", "25"], ["PCCM", "San Francisco", "Family Mosaic", "25"],
["PCCM", "Total PHP Enrollment", "", "853"], ["PCCM", "Total PHP Enrollment", "", "853"],
@ -2721,7 +2759,8 @@ data_lattice_shift_text_left_top = [
], ],
["Blood Pressure #", "2400", "Men (≥ 18yrs)", "10%", "95%", "20%", "1728"], ["Blood Pressure #", "2400", "Men (≥ 18yrs)", "10%", "95%", "20%", "1728"],
["", "", "Women (≥ 18 yrs)", "", "", "", "1728"], ["", "", "Women (≥ 18 yrs)", "", "", "", "1728"],
["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%", "1825"], ["Fasting blood glucose", "2400", "Men (≥ 18 yrs)", "5%", "95%", "20%",
"1825"],
["", "", "Women (≥ 18 yrs)", "", "", "", "1825"], ["", "", "Women (≥ 18 yrs)", "", "", "", "1825"],
[ [
"Knowledge &\nPractices on HTN &\nDM", "Knowledge &\nPractices on HTN &\nDM",
@ -2746,7 +2785,8 @@ data_lattice_shift_text_disable = [
"Sample size\nper State", "Sample size\nper State",
], ],
["Anthropometry", "", "", "", "", "", ""], ["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "2400", "", "All the available individuals", "", "", ""], ["Clinical Examination", "2400", "", "All the available individuals",
"", "", ""],
["History of morbidity", "", "", "", "", "", ""], ["History of morbidity", "", "", "", "", "", ""],
[ [
"Diet survey", "Diet survey",
@ -2758,9 +2798,11 @@ data_lattice_shift_text_disable = [
"", "",
], ],
["", "", "Men (≥ 18yrs)", "", "", "", "1728"], ["", "", "Men (≥ 18yrs)", "", "", "", "1728"],
["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%",
"1728"],
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%",
"1825"],
[ [
"Knowledge &\nPractices on HTN &", "Knowledge &\nPractices on HTN &",
"2400", "2400",
@ -2785,7 +2827,8 @@ data_lattice_shift_text_right_bottom = [
], ],
["Anthropometry", "", "", "", "", "", ""], ["Anthropometry", "", "", "", "", "", ""],
["Clinical Examination", "", "", "", "", "", ""], ["Clinical Examination", "", "", "", "", "", ""],
["History of morbidity", "2400", "", "", "", "", "All the available individuals"], ["History of morbidity", "2400", "", "", "", "",
"All the available individuals"],
[ [
"Diet survey", "Diet survey",
"1200", "1200",
@ -2796,9 +2839,11 @@ data_lattice_shift_text_right_bottom = [
"All the individuals partaking meals in the HH", "All the individuals partaking meals in the HH",
], ],
["", "", "Men (≥ 18yrs)", "", "", "", "1728"], ["", "", "Men (≥ 18yrs)", "", "", "", "1728"],
["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%", "1728"], ["Blood Pressure #", "2400", "Women (≥ 18 yrs)", "10%", "95%", "20%",
"1728"],
["", "", "Men (≥ 18 yrs)", "", "", "", "1825"], ["", "", "Men (≥ 18 yrs)", "", "", "", "1825"],
["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%", "1825"], ["Fasting blood glucose", "2400", "Women (≥ 18 yrs)", "5%", "95%", "20%",
"1825"],
["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"], ["", "2400", "Men (≥ 18 yrs)", "-", "-", "-", "1728"],
[ [
"Knowledge &\nPractices on HTN &\nDM", "Knowledge &\nPractices on HTN &\nDM",
@ -2820,7 +2865,7 @@ data_arabic = [
] ]
data_stream_layout_kwargs = [ data_stream_layout_kwargs = [
["V i n s a u Ve r r e", ""], ["V i n s a u V e r r e", ""],
["Les Blancs", "12.5CL"], ["Les Blancs", "12.5CL"],
["A.O.P Côtes du Rhône", ""], ["A.O.P Côtes du Rhône", ""],
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"], ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],

Binary files not shown (image diffs omitted). Sizes before → after:
8.2 KiB → 8.2 KiB, 48 KiB → 33 KiB, 46 KiB → 46 KiB, 6.7 KiB → 6.7 KiB,
13 KiB → 14 KiB, (new file) → 9.7 KiB, 8.8 KiB → 8.9 KiB, 18 KiB → 19 KiB.

View File
@ -19,10 +19,16 @@ def test_help_output():
output = result.output output = result.output
assert prog_name == "camelot" assert prog_name == "camelot"
assert result.output.startswith("Usage: %(prog_name)s [OPTIONS] COMMAND" % locals()) assert result.output.startswith(
"Usage: %(prog_name)s [OPTIONS] COMMAND" %
locals()
)
assert all( assert all(
v in result.output v in result.output
for v in ["Options:", "--version", "--help", "Commands:", "lattice", "stream"] for v in [
"Options:", "--version", "--help", "Commands:", "lattice",
"stream"
]
) )
@ -120,21 +126,24 @@ def test_cli_output_format():
# json # json
result = runner.invoke( result = runner.invoke(
cli, cli,
["--format", "json", "--output", outfile.format("json"), "stream", infile], ["--format", "json", "--output", outfile.format("json"), "stream",
infile],
) )
assert result.exit_code == 0 assert result.exit_code == 0
# excel # excel
result = runner.invoke( result = runner.invoke(
cli, cli,
["--format", "excel", "--output", outfile.format("xlsx"), "stream", infile], ["--format", "excel", "--output", outfile.format("xlsx"), "stream",
infile],
) )
assert result.exit_code == 0 assert result.exit_code == 0
# html # html
result = runner.invoke( result = runner.invoke(
cli, cli,
["--format", "html", "--output", outfile.format("html"), "stream", infile], ["--format", "html", "--output", outfile.format("html"), "stream",
infile],
) )
assert result.exit_code == 0 assert result.exit_code == 0
@ -166,6 +175,10 @@ def test_cli_quiet():
assert "No tables found on page-1" in result.output assert "No tables found on page-1" in result.output
result = runner.invoke( result = runner.invoke(
cli, ["--quiet", "--format", "csv", "--output", outfile, "stream", infile] cli,
[
"--quiet", "--format", "csv", "--output", outfile, "stream",
infile
]
) )
assert "No tables found on page-1" not in result.output assert "No tables found on page-1" not in result.output
View File
@ -11,12 +11,15 @@ from camelot.__version__ import generate_version
from .data import * from .data import *
testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files") testdir = os.path.join(testdir, "files")
def test_parsing_report(): def test_parsing_report():
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1} parsing_report = {
"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
}
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
@ -28,9 +31,12 @@ def test_password(): @ -28,9 +31,12 @@ def test_password():
filename = os.path.join(testdir, "health_protected.pdf") filename = os.path.join(testdir, "health_protected.pdf")
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream") tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
<<<<<<< HEAD
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
tables = camelot.read_pdf(filename, password="userpass", flavor="stream") tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
=======
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
assert len(tables) == 1
>>>>>>> Fix unit tests, lint, drop Python 2 support
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
@ -229,9 +240,9 @@ def test_repr():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
def test_pages(): def test_pages():
@ -239,22 +250,23 @@ def test_pages():
tables = camelot.read_pdf(url) tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
tables = camelot.read_pdf(url, pages="1-end") tables = camelot.read_pdf(url, pages="1-end")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
tables = camelot.read_pdf(url, pages="all") tables = camelot.read_pdf(url, pages="all")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
@ -264,7 +276,8 @@ def test_url():
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert ( assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
) )
@ -284,7 +297,12 @@ def test_table_order():
return t return t
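# Tables are created out of (page, order) sequence on purpose;
# sorted() should arrange them by page, then order.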
table_list = TableList( table_list = TableList(
[_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)] [
_make_table(2, 1),
_make_table(1, 1),
_make_table(3, 4),
_make_table(1, 2)
]
) )
assert [(t.page, t.order) for t in sorted(table_list)] == [ assert [(t.page, t.order) for t in sorted(table_list)] == [
View File
@ -4,13 +4,30 @@ import os
import pytest import pytest
import matplotlib
from distutils.version import LooseVersion
import camelot import camelot
# The version of Matplotlib has an impact on some of the tests. Unfortunately,
# we can't enforce usage of a recent version of Matplotlib without dropping
# support for Python 3.5.
# To check the version of matplotlib installed:
# pip freeze | grep matplotlib
# To force upgrade:
# pip install --upgrade --force-reinstall matplotlib
# To force usage of a Python 3.5 compatible version:
# pip install "matplotlib==2.2.5"
# This condition can be removed in favor of a version requirement bump for
# matplotlib once support for Python 3.5 is dropped.
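# Note: compare parsed versions rather than raw strings; a plain string
# comparison would misorder releases such as "3.10" vs. "3.2".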
LEGACY_MATPLOTLIB = (
    LooseVersion(matplotlib.__version__) < LooseVersion("3.2.1")
)
testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files") testdir = os.path.join(testdir, "files")
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_text_plot(): def test_text_plot():
@ -26,6 +43,15 @@ def test_grid_plot():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], kind='grid') return camelot.plot(tables[0], kind='grid')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_grid_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
return camelot.plot(tables[0], kind='grid')
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
@ -35,6 +61,8 @@ def test_lattice_contour_plot():
return camelot.plot(tables[0], kind='contour') return camelot.plot(tables[0], kind='contour')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_stream_contour_plot(): def test_stream_contour_plot():
@ -51,6 +79,8 @@ def test_line_plot():
return camelot.plot(tables[0], kind='line') return camelot.plot(tables[0], kind='line')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_joint_plot(): def test_joint_plot():
@ -59,6 +89,8 @@ def test_joint_plot():
return camelot.plot(tables[0], kind='joint') return camelot.plot(tables[0], kind='joint')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare( @pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True) baseline_dir="files/baseline_plots", remove_text=True)
def test_textedge_plot(): def test_textedge_plot():