Fixed strip_text argument getting ignored

pull/4/head
Dimiter Naydenov 2019-07-04 12:12:52 +03:00
parent d5df93635e
commit 240ea6c411
3 changed files with 2587 additions and 284 deletions

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import re
import os import os
import sys import sys
import random import random
@ -405,6 +406,27 @@ def merge_close_lines(ar, line_tol=2):
return ret return ret
def text_strip(text, strip=""):
    """Remove every occurrence of the characters in `strip` from `text`.

    Unlike ``str.strip``, which only trims leading and trailing
    characters, this removes matching characters wherever they appear
    in `text`.

    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`. Each character
        is treated literally, not as a regular expression.

    Returns
    -------
    stripped : str
        `text` with all characters found in `strip` removed. When
        `strip` is empty, `text` is returned unchanged.

    """
    if not strip:
        return text

    # Escape each character individually so that regex metacharacters
    # (e.g. "]", "^", "-", "\\") are matched literally inside the class.
    pattern = r"[{}]".format("".join(map(re.escape, strip)))

    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing re.UNICODE positionally would silently
    # cap the number of replacements at 32 instead of setting the flag.
    stripped = re.sub(pattern, "", text, flags=re.UNICODE)
    return stripped
# TODO: combine the following functions into a TextProcessor class which # TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially # applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline) # (inspired from sklearn.pipeline.Pipeline)
@ -456,10 +478,10 @@ def flag_font_size(textline, direction, strip_text=""):
fchars = [t[0] for t in chars] fchars = [t[0] for t in chars]
if "".join(fchars).strip(): if "".join(fchars).strip():
flist.append("".join(fchars)) flist.append("".join(fchars))
fstring = "".join(flist).strip(strip_text) fstring = "".join(flist)
else: else:
fstring = "".join([t.get_text() for t in textline]).strip(strip_text) fstring = "".join([t.get_text() for t in textline])
return fstring return text_strip(fstring, strip_text)
def split_textline(table, textline, direction, flag_size=False, strip_text=""): def split_textline(table, textline, direction, flag_size=False, strip_text=""):
@ -574,7 +596,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
) )
else: else:
gchars = [t[2].get_text() for t in chars] gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text))) grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
return grouped_chars return grouped_chars
@ -678,7 +702,7 @@ def get_table_index(
error, error,
) )
else: else:
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
def compute_accuracy(error_weights): def compute_accuracy(error_weights):

File diff suppressed because it is too large Load Diff

View File

@ -14,12 +14,7 @@ testdir = os.path.join(testdir, "files")
def test_parsing_report(): def test_parsing_report():
parsing_report = { parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
'accuracy': 99.02,
'whitespace': 12.24,
'order': 1,
'page': 1
}
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
@ -62,7 +57,7 @@ def test_stream_two_tables():
df2 = pd.DataFrame(data_stream_two_tables_2) df2 = pd.DataFrame(data_stream_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf") filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='stream') tables = camelot.read_pdf(filename, flavor="stream")
assert len(tables) == 2 assert len(tables) == 2
assert df1.equals(tables[0].df) assert df1.equals(tables[0].df)
@ -73,7 +68,9 @@ def test_stream_table_regions():
df = pd.DataFrame(data_stream_table_areas) df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf") filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(filename, flavor="stream", table_regions=["320,460,573,335"]) tables = camelot.read_pdf(
filename, flavor="stream", table_regions=["320,460,573,335"]
)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -81,7 +78,9 @@ def test_stream_table_areas():
df = pd.DataFrame(data_stream_table_areas) df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf") filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"]) tables = camelot.read_pdf(
filename, flavor="stream", table_areas=["320,500,573,335"]
)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -90,7 +89,8 @@ def test_stream_columns():
filename = os.path.join(testdir, "mexican_towns.pdf") filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10) filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -99,7 +99,11 @@ def test_stream_split_text():
filename = os.path.join(testdir, "tabula/m27.pdf") filename = os.path.join(testdir, "tabula/m27.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True) filename,
flavor="stream",
columns=["72,95,209,327,442,529,566,606,683"],
split_text=True,
)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -115,7 +119,7 @@ def test_stream_strip_text():
df = pd.DataFrame(data_stream_strip_text) df = pd.DataFrame(data_stream_strip_text)
filename = os.path.join(testdir, "detect_vertical_false.pdf") filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n") tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -132,7 +136,8 @@ def test_stream_layout_kwargs():
filename = os.path.join(testdir, "detect_vertical_false.pdf") filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="stream", layout_kwargs={"detect_vertical": False}) filename, flavor="stream", layout_kwargs={"detect_vertical": False}
)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -140,7 +145,8 @@ def test_lattice():
df = pd.DataFrame(data_lattice) df = pd.DataFrame(data_lattice)
filename = os.path.join( filename = os.path.join(
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf") testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
)
tables = camelot.read_pdf(filename, pages="2") tables = camelot.read_pdf(filename, pages="2")
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -209,10 +215,10 @@ def test_lattice_shift_text():
tables = camelot.read_pdf(filename, line_scale=40) tables = camelot.read_pdf(filename, line_scale=40)
assert df_lt.equals(tables[0].df) assert df_lt.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_scale=40, shift_text=['']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
assert df_disable.equals(tables[0].df) assert df_disable.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b']) tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
assert df_rb.equals(tables[0].df) assert df_rb.equals(tables[0].df)
@ -221,7 +227,9 @@ def test_repr():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
def test_pages(): def test_pages():
@ -229,17 +237,23 @@ def test_pages():
tables = camelot.read_pdf(url) tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
tables = camelot.read_pdf(url, pages='1-end') tables = camelot.read_pdf(url, pages="1-end")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
tables = camelot.read_pdf(url, pages='all') tables = camelot.read_pdf(url, pages="all")
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
def test_url(): def test_url():
@ -247,7 +261,9 @@ def test_url():
tables = camelot.read_pdf(url) tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert (
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
def test_arabic(): def test_arabic():