{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parser comparison\n",
    "\n",
    "This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Bootstrap and common imports\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "sys.path.insert(0, os.path.abspath(''))  # Prefer the local version of camelot if available\n",
    "import camelot\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select a PDF file to review\n",
    "\n",
    "This is seeded with the unit test files for convenience."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment exactly one `pdf_file` line below; `kwargs` is forwarded to camelot.read_pdf.\n",
    "kwargs = {}\n",
    "data = None\n",
    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
    "# pdf_file = \"health.pdf\" # test_hybrid\n",
    "# pdf_file = \"clockwise_table_2.pdf\"\n",
    "\n",
    "# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
    "\n",
    "# pdf_file = \"clockwise_table_2.pdf\" # test_hybrid_table_rotated / test_stream_table_rotated\n",
    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
    "pdf_file = \"vertical_header.pdf\"\n",
    "\n",
    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
    "\n",
    "# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
    "# pdf_file = \"twotables_2.pdf\" # Lattice is better\n",
    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
    "# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"} # test_lattice\n",
    "# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True} # test_lattice_process_background\n",
    "\n",
    "# The path is resolved from the parent of the current working directory.\n",
    "filename = os.path.join(\n",
    "    os.path.dirname(os.path.abspath('.')),\n",
    "    \"camelot/tests/files\",\n",
    "    pdf_file\n",
    ")\n",
    "\n",
    "# Fail early with a clear message instead of erroring inside the first parser call.\n",
    "if not os.path.isfile(filename):\n",
    "    raise FileNotFoundError(f\"Test file not found: {filename}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse the document with each flavor\n",
    "\n",
    "Each flavor is timed. A table identical to one already produced by a previous flavor is reported as such instead of being displayed again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
    "tables_parsed = {}  # flavor -> tables already parsed, used to spot identical results\n",
    "parses = {}  # flavor -> {\"tables\", \"time\", \"error\"}, read by the plotting cell\n",
    "max_tables = 0  # largest table count over all flavors; sizes the plot grid\n",
    "for flavor in FLAVORS:\n",
    "    timer_before_parse = time.perf_counter()\n",
    "    error, tables = None, []\n",
    "    try:\n",
    "        tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
    "    except ValueError as value_error:\n",
    "        # Record the failure and keep going so the other flavors can still be compared.\n",
    "        error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
    "        print(error)\n",
    "    timer_after_parse = time.perf_counter()\n",
    "    max_tables = max(max_tables, len(tables))\n",
    "\n",
    "    parses[flavor] = {\n",
    "        \"tables\": tables,\n",
    "        \"time\": timer_after_parse - timer_before_parse,\n",
    "        \"error\": error\n",
    "    }\n",
    "\n",
    "    print(f\"##### {flavor} ####\")\n",
    "    print(f\"Found {len(tables)} table(s):\")\n",
    "    for table_idx, table in enumerate(tables):\n",
    "        # Every (flavor, index) parsed earlier whose dataframe equals this one.\n",
    "        flavors_matching = [\n",
    "            f\"{previous_flavor} table {prev_idx}\"\n",
    "            for previous_flavor, previous_tables in tables_parsed.items()\n",
    "            for prev_idx, previous_table in enumerate(previous_tables)\n",
    "            if previous_table.df.equals(table.df)\n",
    "        ]\n",
    "        print(f\"## Table {table_idx} ##\")\n",
    "        if flavors_matching:\n",
    "            print(f\"Same as {', '.join(flavors_matching)}.\")\n",
    "        else:\n",
    "            display(table.df)  # `display` is the name the notebook kernel provides\n",
    "        print(\"\")\n",
    "    # Registered only after the comparison so a flavor is never matched against itself.\n",
    "    tables_parsed[flavor] = tables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show tables layout within original document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Set up plotting options\n",
    "PLOT_HEIGHT = 12  # figsize units per subplot; every subplot is square\n",
    "\n",
    "# One column per flavor, one row per table (at least one row so titles always show).\n",
    "row_count = max(max_tables, 1)\n",
    "# squeeze=False keeps `axes` two-dimensional even when there is a single row.\n",
    "fig, axes = plt.subplots(\n",
    "    row_count,\n",
    "    len(FLAVORS),\n",
    "    figsize=(PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count),\n",
    "    squeeze=False,\n",
    ")\n",
    "fig.subplots_adjust(wspace=0, hspace=0)  # Reduce margins to maximize the display zone\n",
    "\n",
    "fig.suptitle('Side-by-side flavor comparison', fontsize=24, fontweight='bold')\n",
    "for col_idx, flavor in enumerate(FLAVORS):\n",
    "    parse = parses[flavor]\n",
    "    tables = parse[\"tables\"]\n",
    "    title = f\"{flavor}\\nDetected {len(tables)} table(s) in {parse['time']:.2f}s\"\n",
    "    if parse['error']:\n",
    "        title += f\"\\nError parsing: {parse['error']}\"\n",
    "    # The summary title always goes on the top subplot of the flavor's column.\n",
    "    axes[0, col_idx].set_title(title, fontsize=12, fontweight='bold')\n",
    "    for row_idx in range(row_count):\n",
    "        ax = axes[row_idx, col_idx]\n",
    "        if row_idx >= len(tables):\n",
    "            # This flavor found fewer tables than the grid has rows: hide the empty frame.\n",
    "            ax.axis(\"off\")\n",
    "            continue\n",
    "        table = tables[row_idx]\n",
    "        # The return value is ignored on purpose so `fig` keeps pointing at the comparison figure.\n",
    "        camelot.plot(table, kind='grid', ax=ax)\n",
    "        ax.text(\n",
    "            0.5, -0.1,  # axes coordinates: horizontally centered, just below the subplot\n",
    "            f\"{flavor} table {row_idx} - {table.shape[0]}x{table.shape[1]}\",\n",
    "            size=14, ha=\"center\",\n",
    "            transform=ax.transAxes,\n",
    "        )"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
   "display_name": "Python 3.7.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}