diff --git a/docs/_static/png/edge_tol_1.png b/docs/_static/png/edge_tol_1.png new file mode 100644 index 0000000..f7f7a67 Binary files /dev/null and b/docs/_static/png/edge_tol_1.png differ diff --git a/docs/_static/png/edge_tol_2.png b/docs/_static/png/edge_tol_2.png new file mode 100644 index 0000000..a5ec743 Binary files /dev/null and b/docs/_static/png/edge_tol_2.png differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 31d9bd3..ca40bb8 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -316,8 +316,85 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" "...","...","...","...","...","...","...","...","...","...","..." -Control how text is grouped into rows -------------------------------------- +Strip characters from text +-------------------------- + +You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF `_ as an example, the text at the start of each row contains a lot of unwanted spaces, dots and newlines. + +:: + + >>> tables = camelot.read_pdf('12s0324.pdf', flavor='stream', strip_text=' .\n') + >>> tables[0].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot -strip ' .\n' stream 12s0324.pdf + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." + "Forcible rape","17.5","2.6","14.9","17.2","2.5","14.7","–","–","–" + "Robbery","102.1","25.5","76.6","90.0","22.9","67.1","12.1","2.5","9.5" + "Aggravated assault","338.4","40.1","298.3","264.0","30.2","233.8","74.4","9.9","64.5" + "Property crime","1,396 .4","338 .7","1,057 .7","875 .9","210 .8","665 .1","608 .2","127 .9","392 .6" + "Burglary","240.9","60.3","180.6","205.0","53.4","151.7","35.9","6.9","29.0" + "...","...","...","...","...","...","...","...","...","..." 
+ +Improve guessed table areas +--------------------------- + +While using :ref:`Stream `, automatic table detection can fail for PDFs like `this one `_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated. + +.. note:: To learn more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis `_. + +Let's see the table area that is detected by default. + +:: + + >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream') + >>> camelot.plot(tables[0], kind='contour') + >>> plt.show() + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot stream -plot contour edge_tol.pdf + +.. figure:: ../_static/png/edge_tol_1.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: Table area with default edge_tol + :align: left + +To improve the detected area, you can increase the ``edge_tol`` (default: 50) value to counter the effect of text being placed relatively far apart vertically. Larger ``edge_tol`` will lead to longer textedges being detected, leading to an improved guess of the table area. Let's use a value of 500. + +:: + + >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500) + >>> camelot.plot(tables[0], kind='contour') + >>> plt.show() + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot stream -e 500 -plot contour edge_tol.pdf + +.. figure:: ../_static/png/edge_tol_2.png + :height: 674 + :width: 1366 + :scale: 50% + :alt: Table area with edge_tol=500 + :align: left + +As you can see, the guessed table area has improved! + +Improve guessed table rows +-------------------------- You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below. 
diff --git a/tests/files/edge_tolerance.pdf b/tests/files/edge_tol.pdf similarity index 100% rename from tests/files/edge_tolerance.pdf rename to tests/files/edge_tol.pdf diff --git a/tests/test_common.py b/tests/test_common.py index 0289655..83c436b 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -113,7 +113,7 @@ def test_stream_strip_text(): def test_stream_edge_tol(): df = pd.DataFrame(data_stream_edge_tol) - filename = os.path.join(testdir, "edge_tolerance.pdf") + filename = os.path.join(testdir, "edge_tol.pdf") tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500) assert df.equals(tables[0].df)