camelot-py/parser-comparison-notebook....

172 lines
7.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Common imports and setup\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "'/Users/francoishuet/Code/camelot/camelot/__init__.py'"
},
"metadata": {},
"execution_count": 1
}
],
"source": [
"import os, sys, time, pytest\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import patches, lines\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pandas.testing import assert_frame_equal\n",
"\n",
"import pdfminer\n",
"\n",
"from IPython.display import display\n",
"\n",
"# Make sure we use the local version of camelot if it is here\n",
"sys.path.insert(0, os.path.abspath(''))\n",
"\n",
"import camelot\n",
"from camelot.core import Table, TableList, TextEdges\n",
"from camelot.__version__ import generate_version\n",
"from camelot.utils import get_text_objects, text_in_bbox\n",
"from camelot.parsers.stream import Stream\n",
"from camelot.parsers.lattice import Lattice\n",
"from camelot.parsers.network import Network\n",
"from camelot.parsers.hybrid import Hybrid\n",
"from camelot.handlers import PDFHandler\n",
"from camelot.plotting import draw_pdf\n",
"from tests.data import *\n",
"\n",
"# Directory holding the sample PDFs. It is resolved from the working directory,\n",
"# which this notebook already treats as the repository root (see the sys.path\n",
"# entry and the `tests.data` import above), so the lookup no longer depends on\n",
"# the checkout folder being named \"camelot\".\n",
"testdir = os.path.join(os.path.abspath('.'), \"tests\", \"files\")\n",
"\n",
"# To check which library we're using\n",
"camelot.__file__\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Defaults shared by every selection below; a selection overrides only the names it assigns.\n",
"kwargs, data = {}, None\n",
"\n",
"# Choose the PDF under review by leaving exactly one selection uncommented.\n",
"# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
"# pdf_file = \"health.pdf\" # test_hybrid\n",
"# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
"# pdf_file = \"clockwise_table_2.pdf\" # test_hybrid_table_rotated / test_stream_table_rotated\n",
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
"# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
"# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
"pdf_file = \"vertical_header.pdf\"\n",
"# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
"# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
"# pdf_file = \"twotables_2.pdf\"\n",
"# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
"# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"} # test_lattice\n",
"# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True} # test_lattice_process_background\n",
"\n",
"# Full path of the selected PDF inside the test files directory.\n",
"filename = os.path.join(testdir, pdf_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse the selected PDF with every flavor and draw the results side by side,\n",
"# one subplot per flavor, sized large enough to inspect visually.\n",
"PARSERS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
"PLOT_HEIGHT = 12\n",
"plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(PARSERS), PLOT_HEIGHT]\n",
"# squeeze=False keeps `axes` two-dimensional, so indexing below still works\n",
"# when PARSERS is trimmed down to a single flavor.\n",
"fig, axes = plt.subplots(1, len(PARSERS), squeeze=False)\n",
"fig.suptitle('Side-by-side Flavor Review')\n",
"tables_list = []  # parse results, in PARSERS order\n",
"for idx, flavor in enumerate(PARSERS):\n",
"    # Time the parsing call only; bookkeeping and plotting are excluded.\n",
"    timer_before_parse = time.perf_counter()\n",
"    tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"    timer_after_parse = time.perf_counter()\n",
"    tables_list.append(tables)\n",
"\n",
"    ax = axes[0, idx]\n",
"    for idx_table, table in enumerate(tables):\n",
"        print(\"Showing table #{idx} found by {flavor}:\".format(idx=idx_table, flavor=flavor))\n",
"        display(table.df)\n",
"        # Draw onto this flavor's axes. The return value is deliberately not bound\n",
"        # to `fig`, so the figure shown at the end is always the shared one.\n",
"        camelot.plot(table, kind='grid', ax=ax)\n",
"\n",
"    # Label every subplot, including flavors that found no table at all.\n",
"    ax.set_title(flavor)\n",
"    tables_dims = \", \".join(\n",
"        \"{rows}x{cols}\".format(rows=table.shape[0], cols=table.shape[1])\n",
"        for table in tables\n",
"    )\n",
"    # Summary line placed just below the axes (coordinates are axes-relative).\n",
"    ax.text(\n",
"        0.5, -0.1,\n",
"        \"Found {table_num} tables ({tables_dims}) in {parse_time:.2f}s\".format(\n",
"            table_num=len(tables),\n",
"            tables_dims=tables_dims,\n",
"            parse_time=timer_after_parse - timer_before_parse,\n",
"        ),\n",
"        size=12, ha=\"center\",\n",
"        transform=ax.transAxes\n",
"    )\n",
"fig"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"version": "3.7.7-final"
},
"orig_nbformat": 2,
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3,
"kernelspec": {
"name": "python37764bit8418972e58f441528b05b4b21a1f095d",
"display_name": "Python 3.7.7 64-bit"
}
},
"nbformat": 4,
"nbformat_minor": 2
}