camelot-py/parser-comparison-notebook....

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Common imports and setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "'/Users/francoishuet/Code/camelot/camelot/__init__.py'"
     },
     "metadata": {},
     "execution_count": 1
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import patches, lines\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas.testing import assert_frame_equal\n",
    "import pdfminer\n",
    "import pytest\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "# The notebook is expected to be run from the repository root: that directory\n",
    "# is put first on sys.path so the local `camelot` and `tests` packages are the\n",
    "# ones that get imported below.\n",
    "REPO_ROOT = os.path.abspath('')\n",
    "\n",
    "# Make sure we use the local version of camelot if it is here\n",
    "sys.path.insert(0, REPO_ROOT)\n",
    "\n",
    "import camelot\n",
    "from camelot.core import Table, TableList, TextEdges\n",
    "from camelot.__version__ import generate_version\n",
    "from camelot.utils import get_text_objects, text_in_bbox\n",
    "from camelot.parsers.stream import Stream\n",
    "from camelot.parsers.lattice import Lattice\n",
    "from camelot.parsers.network import Network\n",
    "from camelot.parsers.hybrid import Hybrid\n",
    "from camelot.handlers import PDFHandler\n",
    "from camelot.plotting import draw_pdf\n",
    "# NOTE(review): the star import supplies the expected-result fixtures\n",
    "# (e.g. data_stream_flag_size) referenced by the PDF selection cell.\n",
    "from tests.data import *\n",
    "\n",
    "# Test PDFs live under <repo root>/tests/files. The path is derived from the\n",
    "# repo root instead of the parent directory plus a hard-coded \"camelot\" folder\n",
    "# name, so it still resolves when the checkout directory is named differently.\n",
    "testdir = os.path.join(REPO_ROOT, \"tests\", \"files\")\n",
    "\n",
    "# To check which library we're using\n",
    "camelot.__file__\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the PDF under review.\n",
    "#   pdf_file: path relative to `testdir`\n",
    "#   kwargs:   extra keyword arguments forwarded to camelot.read_pdf\n",
    "#   data:     presumably the matching expected result from tests.data, if any\n",
    "pdf_file, kwargs, data = \"vertical_header.pdf\", {}, None\n",
    "\n",
    "# Alternatives: uncomment exactly one line to override the selection above.\n",
    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
    "# pdf_file = \"health.pdf\"  # test_hybrid\n",
    "# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
    "# pdf_file = \"clockwise_table_2.pdf\"  # test_hybrid_table_rotated / test_stream_table_rotated\n",
    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
    "# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
    "# pdf_file = \"twotables_2.pdf\"\n",
    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
    "# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"}  # test_lattice\n",
    "# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True}  # test_lattice_process_background\n",
    "\n",
    "# Absolute path handed to camelot.read_pdf by the comparison cell.\n",
    "filename = os.path.join(testdir, pdf_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run every parser flavor on the selected PDF and draw the detected table\n",
    "# grids next to each other, one subplot per flavor.\n",
    "PARSERS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
    "PLOT_HEIGHT = 12\n",
    "\n",
    "# Set up plots to be large enough for visualization\n",
    "plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(PARSERS), PLOT_HEIGHT]\n",
    "\n",
    "# squeeze=False keeps `axes` two-dimensional even when PARSERS holds a single\n",
    "# flavor, so the indexing below does not depend on the length of the list.\n",
    "fig, axes = plt.subplots(1, len(PARSERS), squeeze=False)\n",
    "fig.suptitle('Side-by-side Flavor Review')\n",
    "\n",
    "# One entry per flavor, in PARSERS order, kept for later inspection.\n",
    "tables_list = []\n",
    "for idx, flavor in enumerate(PARSERS):\n",
    "    # Time only the parsing call itself.\n",
    "    timer_before_parse = time.perf_counter()\n",
    "    tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
    "    timer_after_parse = time.perf_counter()\n",
    "    tables_list.append(tables)\n",
    "\n",
    "    ax = axes[0, idx]\n",
    "    for idx_table, table in enumerate(tables):\n",
    "        print(\"Showing table #{idx} found by {flavor}:\".format(idx=idx_table, flavor=flavor))\n",
    "        display(table.df)\n",
    "        # Draw onto this flavor's axes. The return value is deliberately not\n",
    "        # bound to `fig`, so `fig` always refers to the comparison figure\n",
    "        # created above, whatever camelot.plot returns.\n",
    "        camelot.plot(table, kind='grid', ax=ax)\n",
    "    ax.set_title(flavor)\n",
    "\n",
    "    # Caption below the subplot: table count, per-table shape, parse time.\n",
    "    tables_dims = \", \".join(\n",
    "        \"{rows}x{cols}\".format(rows=table.shape[0], cols=table.shape[1])\n",
    "        for table in tables\n",
    "    )\n",
    "    ax.text(\n",
    "        0.5,\n",
    "        -0.1,\n",
    "        \"Found {table_num} tables ({tables_dims}) in {parse_time:.2f}s\".format(\n",
    "            table_num=len(tables),\n",
    "            tables_dims=tables_dims,\n",
    "            parse_time=timer_after_parse - timer_before_parse,\n",
    "        ),\n",
    "        size=12,\n",
    "        ha=\"center\",\n",
    "        transform=ax.transAxes,\n",
    "    )\n",
    "fig"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
   "display_name": "Python 3.7.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}