Add support for parsing PDFs in parallel
Parse pages in parallel with the multiprocessing library, using all available CPUs.
pull/237/head
parent
7709e58d64
commit
63161fe379
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import multiprocessing as mp
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
|
|
@ -140,7 +141,12 @@ class PDFHandler(object):
|
||||||
instream.close()
|
instream.close()
|
||||||
|
|
||||||
def parse(
|
def parse(
|
||||||
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
|
self,
|
||||||
|
flavor="lattice",
|
||||||
|
suppress_stdout=False,
|
||||||
|
parallel=False,
|
||||||
|
layout_kwargs={},
|
||||||
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Extracts tables by calling parser.extract_tables on all single
|
"""Extracts tables by calling parser.extract_tables on all single
|
||||||
page PDFs.
|
page PDFs.
|
||||||
|
|
@ -150,8 +156,10 @@ class PDFHandler(object):
|
||||||
flavor : str (default: 'lattice')
|
flavor : str (default: 'lattice')
|
||||||
The parsing method to use ('lattice' or 'stream').
|
The parsing method to use ('lattice' or 'stream').
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : str (default: False)
|
suppress_stdout : bool (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
parallel : bool (default: False)
|
||||||
|
Process pages in parallel using all available cpu cores.
|
||||||
layout_kwargs : dict, optional (default: {})
|
layout_kwargs : dict, optional (default: {})
|
||||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
kwargs : dict
|
kwargs : dict
|
||||||
|
|
@ -164,16 +172,54 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tables = []
|
tables = []
|
||||||
with TemporaryDirectory() as tempdir:
|
|
||||||
for p in self.pages:
|
|
||||||
self._save_page(self.filepath, p, tempdir)
|
|
||||||
pages = [
|
|
||||||
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
|
|
||||||
]
|
|
||||||
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
|
||||||
for p in pages:
|
with TemporaryDirectory() as tempdir:
|
||||||
t = parser.extract_tables(
|
cpu_count = mp.cpu_count()
|
||||||
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
# Using multiprocessing only when cpu_count > 1 to prevent
|
||||||
)
|
# stalling when cpu_count is 1
|
||||||
|
if parallel and cpu_count > 1:
|
||||||
|
with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
|
||||||
|
jobs = [
|
||||||
|
pool.apply_async(
|
||||||
|
self._parse_page,
|
||||||
|
(p, tempdir, parser, suppress_stdout, layout_kwargs)
|
||||||
|
) for p in self.pages
|
||||||
|
]
|
||||||
|
for j in jobs:
|
||||||
|
t = j.get()
|
||||||
|
tables.extend(t)
|
||||||
|
else:
|
||||||
|
for p in self.pages:
|
||||||
|
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
|
||||||
tables.extend(t)
|
tables.extend(t)
|
||||||
return TableList(sorted(tables))
|
return TableList(sorted(tables))
|
||||||
|
|
||||||
|
def _parse_page(
|
||||||
|
self, page, tempdir, parser, suppress_stdout, layout_kwargs
|
||||||
|
):
|
||||||
|
"""Extracts tables by calling parser.extract_tables on a single
|
||||||
|
page PDF.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
page : str
|
||||||
|
Page number to parse
|
||||||
|
parser : Lattice or Stream
|
||||||
|
The parser to use (Lattice or Stream).
|
||||||
|
suppress_stdout : bool
|
||||||
|
Suppress logs and warnings.
|
||||||
|
layout_kwargs : dict, optional (default: {})
|
||||||
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tables : camelot.core.TableList
|
||||||
|
List of tables found in PDF.
|
||||||
|
|
||||||
|
"""
|
||||||
|
self._save_page(self.filepath, page, tempdir)
|
||||||
|
page_path = os.path.join(tempdir, f"page-{page}.pdf")
|
||||||
|
tables = parser.extract_tables(
|
||||||
|
page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
|
||||||
|
)
|
||||||
|
return tables
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ def read_pdf(
|
||||||
password=None,
|
password=None,
|
||||||
flavor="lattice",
|
flavor="lattice",
|
||||||
suppress_stdout=False,
|
suppress_stdout=False,
|
||||||
|
parallel=False,
|
||||||
layout_kwargs={},
|
layout_kwargs={},
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
|
|
@ -34,6 +35,8 @@ def read_pdf(
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool, optional (default: False)
|
suppress_stdout : bool, optional (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
parallel : bool, optional (default: False)
|
||||||
|
Process pages in parallel using all available cpu cores.
|
||||||
layout_kwargs : dict, optional (default: {})
|
layout_kwargs : dict, optional (default: {})
|
||||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||||
table_areas : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
|
|
@ -113,6 +116,7 @@ def read_pdf(
|
||||||
tables = p.parse(
|
tables = p.parse(
|
||||||
flavor=flavor,
|
flavor=flavor,
|
||||||
suppress_stdout=suppress_stdout,
|
suppress_stdout=suppress_stdout,
|
||||||
|
parallel=parallel,
|
||||||
layout_kwargs=layout_kwargs,
|
layout_kwargs=layout_kwargs,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
def pytest_generate_tests(metafunc):
|
||||||
|
if "parallel" in metafunc.fixturenames:
|
||||||
|
metafunc.parametrize("parallel", [
|
||||||
|
pytest.param(True, id="parallel=True"),
|
||||||
|
pytest.param(False, id="parallel=False")
|
||||||
|
])
|
||||||
|
|
@ -14,219 +14,275 @@ from .data import *
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
def test_parsing_report(parallel):
|
||||||
def test_parsing_report():
|
|
||||||
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
|
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert tables[0].parsing_report == parsing_report
|
assert tables[0].parsing_report == parsing_report
|
||||||
|
|
||||||
|
|
||||||
def test_password():
|
|
||||||
|
def test_password(parallel):
|
||||||
df = pd.DataFrame(data_stream)
|
df = pd.DataFrame(data_stream)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health_protected.pdf")
|
filename = os.path.join(testdir, "health_protected.pdf")
|
||||||
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
tables = camelot.read_pdf(filename, password="userpass", flavor="stream", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream():
|
|
||||||
|
def test_stream(parallel):
|
||||||
df = pd.DataFrame(data_stream)
|
df = pd.DataFrame(data_stream)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health.pdf")
|
filename = os.path.join(testdir, "health.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_table_rotated():
|
|
||||||
|
def test_stream_table_rotated(parallel):
|
||||||
df = pd.DataFrame(data_stream_table_rotated)
|
df = pd.DataFrame(data_stream_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_two_tables():
|
|
||||||
|
def test_stream_two_tables(parallel):
|
||||||
df1 = pd.DataFrame(data_stream_two_tables_1)
|
df1 = pd.DataFrame(data_stream_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_stream_two_tables_2)
|
df2 = pd.DataFrame(data_stream_two_tables_2)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
|
||||||
|
|
||||||
assert len(tables) == 2
|
assert len(tables) == 2
|
||||||
assert df1.equals(tables[0].df)
|
assert df1.equals(tables[0].df)
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_table_regions():
|
|
||||||
|
def test_stream_table_regions(parallel):
|
||||||
df = pd.DataFrame(data_stream_table_areas)
|
df = pd.DataFrame(data_stream_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="stream", table_regions=["320,460,573,335"]
|
filename, flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
table_regions=["320,460,573,335"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_table_areas():
|
def test_stream_table_areas(parallel):
|
||||||
df = pd.DataFrame(data_stream_table_areas)
|
df = pd.DataFrame(data_stream_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="stream", table_areas=["320,500,573,335"]
|
filename, flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
table_areas=["320,500,573,335"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_columns():
|
def test_stream_columns(parallel):
|
||||||
df = pd.DataFrame(data_stream_columns)
|
df = pd.DataFrame(data_stream_columns)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
|
filename,
|
||||||
|
flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
columns=["67,180,230,425,475"],
|
||||||
|
row_tol=10
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_split_text():
|
def test_stream_split_text(parallel):
|
||||||
df = pd.DataFrame(data_stream_split_text)
|
df = pd.DataFrame(data_stream_split_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/m27.pdf")
|
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename,
|
filename,
|
||||||
flavor="stream",
|
flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
columns=["72,95,209,327,442,529,566,606,683"],
|
columns=["72,95,209,327,442,529,566,606,683"],
|
||||||
split_text=True,
|
split_text=True,
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_flag_size():
|
def test_stream_flag_size(parallel):
|
||||||
df = pd.DataFrame(data_stream_flag_size)
|
df = pd.DataFrame(data_stream_flag_size)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "superscript.pdf")
|
filename = os.path.join(testdir, "superscript.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
|
|
||||||
assert_frame_equal(df, tables[0].df)
|
|
||||||
|
|
||||||
|
|
||||||
def test_stream_strip_text():
|
|
||||||
df = pd.DataFrame(data_stream_strip_text)
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
|
||||||
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
|
|
||||||
assert_frame_equal(df, tables[0].df)
|
|
||||||
|
|
||||||
|
|
||||||
def test_stream_edge_tol():
|
|
||||||
df = pd.DataFrame(data_stream_edge_tol)
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "edge_tol.pdf")
|
|
||||||
tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
|
|
||||||
assert_frame_equal(df, tables[0].df)
|
|
||||||
|
|
||||||
|
|
||||||
def test_stream_layout_kwargs():
|
|
||||||
df = pd.DataFrame(data_stream_layout_kwargs)
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="stream", layout_kwargs={"detect_vertical": False}
|
filename,
|
||||||
|
flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
flag_size=True
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice():
|
def test_stream_strip_text(parallel):
|
||||||
|
df = pd.DataFrame(data_stream_strip_text)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename, flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
strip_text=" ,\n"
|
||||||
|
)
|
||||||
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_edge_tol(parallel):
|
||||||
|
df = pd.DataFrame(data_stream_edge_tol)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "edge_tol.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename, flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
edge_tol=500
|
||||||
|
)
|
||||||
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_layout_kwargs(parallel):
|
||||||
|
df = pd.DataFrame(data_stream_layout_kwargs)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
flavor="stream",
|
||||||
|
parallel=parallel,
|
||||||
|
layout_kwargs={"detect_vertical": False}
|
||||||
|
)
|
||||||
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice(parallel):
|
||||||
df = pd.DataFrame(data_lattice)
|
df = pd.DataFrame(data_lattice)
|
||||||
|
|
||||||
filename = os.path.join(
|
filename = os.path.join(
|
||||||
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
|
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
|
||||||
)
|
)
|
||||||
tables = camelot.read_pdf(filename, pages="2")
|
tables = camelot.read_pdf(filename, pages="2", parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_table_rotated():
|
def test_lattice_table_rotated(parallel):
|
||||||
df = pd.DataFrame(data_lattice_table_rotated)
|
df = pd.DataFrame(data_lattice_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_two_tables():
|
def test_lattice_two_tables(parallel):
|
||||||
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert len(tables) == 2
|
assert len(tables) == 2
|
||||||
assert df1.equals(tables[0].df)
|
assert df1.equals(tables[0].df)
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_table_regions():
|
def test_lattice_table_regions(parallel):
|
||||||
df = pd.DataFrame(data_lattice_table_regions)
|
df = pd.DataFrame(data_lattice_table_regions)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "table_region.pdf")
|
filename = os.path.join(testdir, "table_region.pdf")
|
||||||
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
table_regions=["170,370,560,270"]
|
||||||
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_table_areas():
|
def test_lattice_table_areas(parallel):
|
||||||
df = pd.DataFrame(data_lattice_table_areas)
|
df = pd.DataFrame(data_lattice_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
table_areas=["80,693,535,448"]
|
||||||
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_process_background():
|
def test_lattice_process_background(parallel):
|
||||||
df = pd.DataFrame(data_lattice_process_background)
|
df = pd.DataFrame(data_lattice_process_background)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, process_background=True)
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
process_background=True,
|
||||||
|
)
|
||||||
assert_frame_equal(df, tables[1].df)
|
assert_frame_equal(df, tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_copy_text():
|
def test_lattice_copy_text(parallel):
|
||||||
df = pd.DataFrame(data_lattice_copy_text)
|
df = pd.DataFrame(data_lattice_copy_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||||
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
line_scale=60,
|
||||||
|
copy_text="v"
|
||||||
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_shift_text():
|
def test_lattice_shift_text(parallel):
|
||||||
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
||||||
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
||||||
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "column_span_2.pdf")
|
filename = os.path.join(testdir, "column_span_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, line_scale=40)
|
tables = camelot.read_pdf(filename, parallel=parallel, line_scale=40)
|
||||||
assert df_lt.equals(tables[0].df)
|
assert df_lt.equals(tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
line_scale=40,
|
||||||
|
shift_text=[""]
|
||||||
|
)
|
||||||
assert df_disable.equals(tables[0].df)
|
assert df_disable.equals(tables[0].df)
|
||||||
|
|
||||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
parallel=parallel,
|
||||||
|
line_scale=40,
|
||||||
|
shift_text=["r", "b"]
|
||||||
|
)
|
||||||
assert df_rb.equals(tables[0].df)
|
assert df_rb.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_repr():
|
def test_repr(parallel):
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -234,23 +290,23 @@ def test_repr():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_pages():
|
def test_pages(parallel):
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(url)
|
tables = camelot.read_pdf(url, parallel=parallel)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
)
|
)
|
||||||
|
|
||||||
tables = camelot.read_pdf(url, pages="1-end")
|
tables = camelot.read_pdf(url, pages="1-end", parallel=parallel)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
)
|
)
|
||||||
|
|
||||||
tables = camelot.read_pdf(url, pages="all")
|
tables = camelot.read_pdf(url, pages="all", parallel=parallel)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -258,9 +314,9 @@ def test_pages():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_url():
|
def test_url(parallel):
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(url)
|
tables = camelot.read_pdf(url, parallel=parallel)
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -268,11 +324,11 @@ def test_url():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_arabic():
|
def test_arabic(parallel):
|
||||||
df = pd.DataFrame(data_arabic)
|
df = pd.DataFrame(data_arabic)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/arabic.pdf")
|
filename = os.path.join(testdir, "tabula/arabic.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, parallel=parallel)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue