Fixed strip_text argument getting ignored

2019-07-04 12:12:52 +03:00 · 2019-07-04 12:12:52 +03:00 · 240ea6c411
parent d5df93635e
commit 240ea6c411
3 changed files with 2587 additions and 284 deletions
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import re
 import os
 import sys
 import random
@ -405,6 +406,27 @@ def merge_close_lines(ar, line_tol=2):
    return ret
 def text_strip(text, strip=""):
    """Remove every occurrence of the characters in `strip` from `text`.
    Unlike ``str.strip``, characters are removed wherever they appear in
    `text`, not only at its ends.
    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.
    Returns
    -------
    stripped : str
        `text` with all characters listed in `strip` removed; `text`
        is returned unchanged when `strip` is empty.
    """
    if not strip:
        return text
    # Escape each character so regex metacharacters (']', '^', '-', ...)
    # are matched literally inside the character class.
    pattern = r"[{}]".format("".join(map(re.escape, strip)))
    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.UNICODE would cap the number of
    # replacements instead of setting the flag.
    stripped = re.sub(pattern, "", text, flags=re.UNICODE)
    return stripped
 # TODO: combine the following functions into a TextProcessor class which
 # applies corresponding transformations sequentially
 # (inspired from sklearn.pipeline.Pipeline)
@ -456,10 +478,10 @@ def flag_font_size(textline, direction, strip_text=""):
                fchars = [t[0] for t in chars]
                if "".join(fchars).strip():
                    flist.append("".join(fchars))
-        fstring = "".join(flist).strip(strip_text)
+        fstring = "".join(flist)
    else:
-        fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
+        fstring = "".join([t.get_text() for t in textline])
-    return fstring
+    return text_strip(fstring, strip_text)
 def split_textline(table, textline, direction, flag_size=False, strip_text=""):
@ -574,7 +596,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            )
        else:
            gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
+            grouped_chars.append(
                (key[0], key[1], text_strip("".join(gchars), strip_text))
            )
    return grouped_chars
@ -678,7 +702,7 @@ def get_table_index(
                error,
            )
        else:
-            return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
+            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
 def compute_accuracy(error_weights):
--- a/tests/data.py
+++ b/tests/data.py
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -14,12 +14,7 @@ testdir = os.path.join(testdir, "files")
 def test_parsing_report():
-    parsing_report = {
+    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
        'accuracy': 99.02,
        'whitespace': 12.24,
        'order': 1,
        'page': 1
    }
    filename = os.path.join(testdir, "foo.pdf")
    tables = camelot.read_pdf(filename)
@ -62,7 +57,7 @@ def test_stream_two_tables():
    df2 = pd.DataFrame(data_stream_two_tables_2)
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor='stream')
+    tables = camelot.read_pdf(filename, flavor="stream")
    assert len(tables) == 2
    assert df1.equals(tables[0].df)
@ -73,7 +68,9 @@ def test_stream_table_regions():
    df = pd.DataFrame(data_stream_table_areas)
    filename = os.path.join(testdir, "tabula/us-007.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", table_regions=["320,460,573,335"])
+    tables = camelot.read_pdf(
        filename, flavor="stream", table_regions=["320,460,573,335"]
    )
    assert df.equals(tables[0].df)
@ -81,7 +78,9 @@ def test_stream_table_areas():
    df = pd.DataFrame(data_stream_table_areas)
    filename = os.path.join(testdir, "tabula/us-007.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"])
+    tables = camelot.read_pdf(
        filename, flavor="stream", table_areas=["320,500,573,335"]
    )
    assert df.equals(tables[0].df)
@ -90,7 +89,8 @@ def test_stream_columns():
    filename = os.path.join(testdir, "mexican_towns.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
+        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
    )
    assert df.equals(tables[0].df)
@ -99,7 +99,11 @@ def test_stream_split_text():
    filename = os.path.join(testdir, "tabula/m27.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True)
+        filename,
        flavor="stream",
        columns=["72,95,209,327,442,529,566,606,683"],
        split_text=True,
    )
    assert df.equals(tables[0].df)
@ -115,7 +119,7 @@ def test_stream_strip_text():
    df = pd.DataFrame(data_stream_strip_text)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
+    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
    assert df.equals(tables[0].df)
@ -132,7 +136,8 @@ def test_stream_layout_kwargs():
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", layout_kwargs={"detect_vertical": False})
+        filename, flavor="stream", layout_kwargs={"detect_vertical": False}
    )
    assert df.equals(tables[0].df)
@ -140,7 +145,8 @@ def test_lattice():
    df = pd.DataFrame(data_lattice)
    filename = os.path.join(
-        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
+        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    )
    tables = camelot.read_pdf(filename, pages="2")
    assert df.equals(tables[0].df)
@ -209,10 +215,10 @@ def test_lattice_shift_text():
    tables = camelot.read_pdf(filename, line_scale=40)
    assert df_lt.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
    assert df_disable.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
    assert df_rb.equals(tables[0].df)
@ -221,7 +227,9 @@ def test_repr():
    tables = camelot.read_pdf(filename)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
 def test_pages():
@ -229,17 +237,23 @@ def test_pages():
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
-    tables = camelot.read_pdf(url, pages='1-end')
+    tables = camelot.read_pdf(url, pages="1-end")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
-    tables = camelot.read_pdf(url, pages='all')
+    tables = camelot.read_pdf(url, pages="all")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
 def test_url():
@ -247,7 +261,9 @@ def test_url():
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
 def test_arabic():