172 lines
7.8 KiB
Plaintext
172 lines
7.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
"# Common imports and setup\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": "'/Users/francoishuet/Code/camelot/camelot/__init__.py'"
|
|
},
|
|
"metadata": {},
|
|
"execution_count": 1
|
|
}
|
|
],
|
|
"source": [
|
|
"import os, sys, time, pytest\n",
|
|
"\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from matplotlib import patches, lines\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from pandas.testing import assert_frame_equal\n",
|
|
"\n",
|
|
"import pdfminer\n",
|
|
"\n",
|
|
"from IPython.display import display\n",
|
|
"\n",
|
|
"# Make sure we use the local version of camelot if it is here\n",
|
|
"sys.path.insert(0, os.path.abspath(''))\n",
|
|
"\n",
|
|
"import camelot\n",
|
|
"from camelot.core import Table, TableList, TextEdges\n",
|
|
"from camelot.__version__ import generate_version\n",
|
|
"from camelot.utils import get_text_objects, text_in_bbox\n",
|
|
"from camelot.parsers.stream import Stream\n",
|
|
"from camelot.parsers.lattice import Lattice\n",
|
|
"from camelot.parsers.network import Network\n",
|
|
"from camelot.parsers.hybrid import Hybrid\n",
|
|
"from camelot.handlers import PDFHandler\n",
|
|
"from camelot.plotting import draw_pdf\n",
|
|
"from tests.data import *\n",
|
|
"\n",
"# Directory holding the sample PDFs. Prefer tests/files under the working directory\n",
"# (the same directory that was put on sys.path above); fall back to the historical\n",
"# layout, which assumed the checkout directory is literally named \"camelot\".\n",
"testdir = os.path.join(os.path.abspath('.'), \"tests\", \"files\")\n",
"if not os.path.isdir(testdir):\n",
"    testdir = os.path.join(os.path.dirname(os.path.abspath('.')), \"camelot/tests/files\")\n",
"\n",
"# To check which library we're using\n",
"camelot.__file__\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Choose the PDF under review: keep exactly one `pdf_file` assignment uncommented.\n",
"# `kwargs` is passed through to camelot.read_pdf by the comparison cell below;\n",
"# `data` is only set by the entries that name a data_* fixture (presumably from tests.data).\n",
"kwargs = {}\n",
"data = None\n",
"# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
"# pdf_file = \"health.pdf\" # test_hybrid\n",
"# pdf_file = \"tabula/12s0324.pdf\" # interesting because contains two separate tables\n",
"# pdf_file = \"clockwise_table_2.pdf\" # test_hybrid_table_rotated / test_stream_table_rotated\n",
"# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n",
"# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n",
"# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n",
"pdf_file = \"vertical_header.pdf\"\n",
"# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
"# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
"# pdf_file = \"twotables_2.pdf\"\n",
"# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
"# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
"# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"} # test_lattice\n",
"# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True} # test_lattice_process_background\n",
"\n",
"# Absolute path of the selected PDF inside the test files directory\n",
"filename = os.path.join(testdir, pdf_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
"outputs": [],
|
|
"source": [
"# Side-by-side comparison: parse the selected PDF with every flavor, show each table\n",
"# found, and draw its grid on that flavor's subplot together with a one-line summary\n",
"# (number of tables, their dimensions, parse time).\n",
"PARSERS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
"PLOT_HEIGHT = 12\n",
"# Set up plots to be large enough for visualization\n",
"plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(PARSERS), PLOT_HEIGHT]\n",
"# squeeze=False keeps `axes` a 2-D array even when PARSERS holds a single flavor\n",
"fig, axes = plt.subplots(1, len(PARSERS), squeeze=False)\n",
"fig.suptitle('Side-by-side Flavor Review')\n",
"tables_list = []  # one parse result per flavor, in PARSERS order\n",
"for idx, flavor in enumerate(PARSERS):\n",
"    # Time only the parse itself\n",
"    timer_before_parse = time.perf_counter()\n",
"    tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"    timer_after_parse = time.perf_counter()\n",
"    tables_list.append(tables)\n",
"    ax = axes[0, idx]\n",
"    for idx_table, table in enumerate(tables):\n",
"        print(\"Showing table #{idx} found by {flavor}:\".format(idx=idx_table, flavor=flavor))\n",
"        display(table.df)\n",
"        # Draw on this flavor's subplot. The return value is deliberately ignored so\n",
"        # that `fig` keeps referring to the side-by-side figure shown at the end.\n",
"        camelot.plot(table, kind='grid', ax=ax)\n",
"    ax.set_title(flavor)\n",
"    tables_dims = \", \".join(\n",
"        \"{rows}x{cols}\".format(rows=table.shape[0], cols=table.shape[1])\n",
"        for table in tables\n",
"    )\n",
"    # Summary caption, centered just below the subplot (axes coordinates)\n",
"    ax.text(\n",
"        0.5, -0.1,\n",
"        \"Found {table_num} tables ({tables_dims}) in {parse_time:.2f}s\".format(\n",
"            table_num=len(tables),\n",
"            tables_dims=tables_dims,\n",
"            parse_time=timer_after_parse - timer_before_parse,\n",
"        ),\n",
"        size=12, ha=\"center\",\n",
"        transform=ax.transAxes\n",
"    )\n",
"fig"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"language_info": {
|
|
"name": "python",
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"version": "3.7.7-final"
|
|
},
|
|
"orig_nbformat": 2,
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"npconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": 3,
|
|
"kernelspec": {
|
|
"name": "python37764bit8418972e58f441528b05b4b21a1f095d",
|
|
"display_name": "Python 3.7.7 64-bit"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
} |