Merge pull request #4 from camelot-dev/fix-strip-text-arg

[MRG] Fixed strip_text argument getting ignored
2019-07-04 18:26:11 +03:00 · 2019-07-04 18:26:11 +03:00 · e81e818b0e
parent d5df93635e 240ea6c411
commit e81e818b0e
3 changed files with 2587 additions and 284 deletions
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import division

+import re
 import os
 import sys
 import random
@ -405,6 +406,27 @@ def merge_close_lines(ar, line_tol=2):
    return ret


+def text_strip(text, strip=""):
+    """Remove every occurrence of the characters in `strip` from `text`.
+
+    Unlike ``str.strip``, matching characters are removed wherever they
+    occur in `text`, not only at its ends.
+
+    Parameters
+    ----------
+    text : str
+        Text to process and strip.
+    strip : str, optional (default: '')
+        Characters that should be stripped from `text`.
+
+    Returns
+    -------
+    stripped : str
+        `text` with all characters listed in `strip` removed; `text`
+        itself is returned unchanged when `strip` is empty.
+
+    """
+    if not strip:
+        return text
+
+    # Escape each character individually so regex metacharacters
+    # (e.g. "]", "^", "-") are matched literally inside the class.
+    pattern = r"[{}]".format("".join(map(re.escape, strip)))
+    # `flags` must be passed by keyword: the fourth positional argument of
+    # re.sub is `count`, so a positional re.UNICODE (== 32) would cap the
+    # number of replacements at 32 instead of setting the flag.
+    stripped = re.sub(pattern, "", text, flags=re.UNICODE)
+    return stripped
+
+
 # TODO: combine the following functions into a TextProcessor class which
 # applies corresponding transformations sequentially
 # (inspired from sklearn.pipeline.Pipeline)
@ -456,10 +478,10 @@ def flag_font_size(textline, direction, strip_text=""):
                fchars = [t[0] for t in chars]
                if "".join(fchars).strip():
                    flist.append("".join(fchars))
-        fstring = "".join(flist).strip(strip_text)
+        fstring = "".join(flist)
    else:
-        fstring = "".join([t.get_text() for t in textline]).strip(strip_text)
-    return fstring
+        fstring = "".join([t.get_text() for t in textline])
+    return text_strip(fstring, strip_text)


 def split_textline(table, textline, direction, flag_size=False, strip_text=""):
@ -574,7 +596,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            )
        else:
            gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append((key[0], key[1], "".join(gchars).strip(strip_text)))
+            grouped_chars.append(
+                (key[0], key[1], text_strip("".join(gchars), strip_text))
+            )
    return grouped_chars


@ -678,7 +702,7 @@ def get_table_index(
                error,
            )
        else:
-            return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
+            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error


 def compute_accuracy(error_weights):
--- a/tests/data.py
+++ b/tests/data.py
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -14,12 +14,7 @@ testdir = os.path.join(testdir, "files")


 def test_parsing_report():
-    parsing_report = {
-        'accuracy': 99.02,
-        'whitespace': 12.24,
-        'order': 1,
-        'page': 1
-    }
+    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}

    filename = os.path.join(testdir, "foo.pdf")
    tables = camelot.read_pdf(filename)
@ -62,7 +57,7 @@ def test_stream_two_tables():
    df2 = pd.DataFrame(data_stream_two_tables_2)

    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor='stream')
+    tables = camelot.read_pdf(filename, flavor="stream")

    assert len(tables) == 2
    assert df1.equals(tables[0].df)
@ -73,7 +68,9 @@ def test_stream_table_regions():
    df = pd.DataFrame(data_stream_table_areas)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", table_regions=["320,460,573,335"])
+    tables = camelot.read_pdf(
+        filename, flavor="stream", table_regions=["320,460,573,335"]
+    )
    assert df.equals(tables[0].df)


@ -81,7 +78,9 @@ def test_stream_table_areas():
    df = pd.DataFrame(data_stream_table_areas)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"])
+    tables = camelot.read_pdf(
+        filename, flavor="stream", table_areas=["320,500,573,335"]
+    )
    assert df.equals(tables[0].df)


@ -90,7 +89,8 @@ def test_stream_columns():

    filename = os.path.join(testdir, "mexican_towns.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
+        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
+    )
    assert df.equals(tables[0].df)


@ -99,7 +99,11 @@ def test_stream_split_text():

    filename = os.path.join(testdir, "tabula/m27.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True)
+        filename,
+        flavor="stream",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
    assert df.equals(tables[0].df)


@ -115,7 +119,7 @@ def test_stream_strip_text():
    df = pd.DataFrame(data_stream_strip_text)

    filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
+    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
    assert df.equals(tables[0].df)


@ -132,7 +136,8 @@ def test_stream_layout_kwargs():

    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", layout_kwargs={"detect_vertical": False})
+        filename, flavor="stream", layout_kwargs={"detect_vertical": False}
+    )
    assert df.equals(tables[0].df)


@ -140,7 +145,8 @@ def test_lattice():
    df = pd.DataFrame(data_lattice)

    filename = os.path.join(
-        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
+        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
+    )
    tables = camelot.read_pdf(filename, pages="2")
    assert df.equals(tables[0].df)

@ -209,10 +215,10 @@ def test_lattice_shift_text():
    tables = camelot.read_pdf(filename, line_scale=40)
    assert df_lt.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
    assert df_disable.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
    assert df_rb.equals(tables[0].df)


@ -221,7 +227,9 @@ def test_repr():
    tables = camelot.read_pdf(filename)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
+        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    )


 def test_pages():
@ -229,17 +237,23 @@ def test_pages():
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
+        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    )

-    tables = camelot.read_pdf(url, pages='1-end')
+    tables = camelot.read_pdf(url, pages="1-end")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
+        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    )

-    tables = camelot.read_pdf(url, pages='all')
+    tables = camelot.read_pdf(url, pages="all")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
+        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    )


 def test_url():
@ -247,7 +261,9 @@ def test_url():
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    assert (
+        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+    )


 def test_arabic():