{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Side-by-side flavor review\n",
    "\n",
    "Parses one test PDF with the `stream`, `lattice`, `network` and `hybrid` flavors, displays every table each flavor finds, and plots the detected grids next to each other.\n",
    "\n",
    "## Common imports and setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys, time, pytest\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import patches, lines\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas.testing import assert_frame_equal\n",
    "\n",
    "import pdfminer\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "# Make sure we use the local version of camelot if it is here\n",
    "sys.path.insert(0, os.path.abspath(''))\n",
    "\n",
    "import camelot\n",
    "from camelot.core import Table, TableList, TextEdges\n",
    "from camelot.__version__ import generate_version\n",
    "from camelot.utils import get_text_objects, text_in_bbox\n",
    "from camelot.parsers.stream import Stream\n",
    "from camelot.parsers.lattice import Lattice\n",
    "from camelot.parsers.network import Network\n",
    "from camelot.parsers.hybrid import Hybrid\n",
    "from camelot.handlers import PDFHandler\n",
    "from camelot.plotting import draw_pdf\n",
    "\n",
    "# Star import on purpose: it exposes the expected-result fixtures (data_stream_*)\n",
    "# that the commented-out alternatives in the PDF selection cell use by bare name.\n",
    "import tests.data\n",
    "from tests.data import *\n",
    "\n",
    "# The fixture PDFs live in the \"files\" directory next to tests/data.py. Anchoring on\n",
    "# the imported package (rather than the parent of the working directory plus a\n",
    "# hard-coded \"camelot\" folder name) keeps this working however the checkout is named.\n",
    "testdir = os.path.join(os.path.dirname(os.path.abspath(tests.data.__file__)), \"files\")\n",
    "\n",
    "# To check which library we're using\n",
    "camelot.__file__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Choose the PDF to review\n",
    "\n",
    "Leave exactly one `pdf_file` assignment uncommented. `kwargs` is forwarded to `camelot.read_pdf`; `data` optionally holds the matching expected-result fixture from `tests.data`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defaults, overridden by whichever alternative below is uncommented\n",
    "kwargs = {}\n",
    "data = None\n",
    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
    "# pdf_file = \"health.pdf\" # test_hybrid\n",
    "# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
    "# pdf_file = \"clockwise_table_2.pdf\" # test_hybrid_table_rotated / test_stream_table_rotated\n",
    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
    "pdf_file = \"vertical_header.pdf\"\n",
    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
    "# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
    "# pdf_file = \"twotables_2.pdf\"\n",
    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
    "# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"} # test_lattice\n",
    "# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True} # test_lattice_process_background\n",
    "\n",
    "filename = os.path.join(testdir, pdf_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse with every flavor and plot the grids side by side\n",
    "\n",
    "Each flavor gets its own subplot, captioned with the number of tables found, their dimensions, and the parse time.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up plots to be large enough for visualization\n",
    "PARSERS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
    "PLOT_HEIGHT = 12\n",
    "plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(PARSERS), PLOT_HEIGHT]\n",
    "# squeeze=False keeps `axes` two-dimensional even if PARSERS is trimmed to one flavor\n",
    "fig, axes = plt.subplots(1, len(PARSERS), squeeze=False)\n",
    "fig.suptitle('Side-by-side Flavor Review')\n",
    "tables_list = []\n",
    "for idx, flavor in enumerate(PARSERS):\n",
    "    # The timer brackets the parse only, so the caption reports pure parsing time\n",
    "    timer_before_parse = time.perf_counter()\n",
    "    tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
    "    timer_after_parse = time.perf_counter()\n",
    "    tables_list.append(tables)\n",
    "    ax = axes[0, idx]\n",
    "    for idx_table, table in enumerate(tables):\n",
    "        print(\"Showing table #{idx} found by {flavor}:\".format(idx=idx_table, flavor=flavor))\n",
    "        display(table.df)\n",
    "        # Draw onto this flavor's subplot. The return value is deliberately not bound\n",
    "        # to `fig`, so that name keeps pointing at the side-by-side figure shown below.\n",
    "        camelot.plot(table, kind='grid', ax=ax)\n",
    "    ax.set_title(flavor)\n",
    "    # One \"rows x cols\" entry per table found by this flavor\n",
    "    tables_dims = \", \".join(\n",
    "        \"{rows}x{cols}\".format(rows=table.shape[0], cols=table.shape[1])\n",
    "        for table in tables\n",
    "    )\n",
    "    # Caption placed just below the subplot, in axes-relative coordinates\n",
    "    ax.text(\n",
    "        0.5, -0.1,\n",
    "        \"Found {table_num} tables ({tables_dims}) in {parse_time:.2f}s\".format(\n",
    "            table_num=len(tables),\n",
    "            tables_dims=tables_dims,\n",
    "            parse_time=timer_after_parse - timer_before_parse,\n",
    "        ),\n",
    "        size=12, ha=\"center\",\n",
    "        transform=ax.transAxes\n",
    "    )\n",
    "fig"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
   "display_name": "Python 3.7.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}