{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Common imports and setup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "'/Users/francoishuet/Code/camelot/camelot/__init__.py'"
     },
     "metadata": {},
     "execution_count": 1
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import patches, lines\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas.testing import assert_frame_equal\n",
    "import pdfminer\n",
    "import pytest\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "# Make sure we use the local version of camelot if it is here.\n",
    "# This must run before the camelot / tests imports below so that the\n",
    "# working directory takes precedence over any installed copy.\n",
    "sys.path.insert(0, os.path.abspath(''))\n",
    "\n",
    "import camelot\n",
    "from camelot.core import Table, TableList, TextEdges\n",
    "from camelot.__version__ import generate_version\n",
    "from camelot.utils import get_text_objects, text_in_bbox\n",
    "from camelot.parsers.stream import Stream\n",
    "from camelot.parsers.hybrid import Hybrid\n",
    "from camelot.handlers import PDFHandler\n",
    "from camelot.plotting import draw_pdf\n",
    "# Wildcard import kept as-is: it presumably supplies the data_* expected\n",
    "# tables (e.g. data_stream_flag_size) referenced by the presets in the next cell.\n",
    "from tests.data import *\n",
    "\n",
    "# Test PDFs live in tests/files under the working directory, i.e. the same\n",
    "# directory that was put on sys.path above. Building the path from the\n",
    "# working directory itself avoids depending on the checkout folder being\n",
    "# named \"camelot\".\n",
    "testdir = os.path.join(os.path.abspath('.'), \"tests\", \"files\")\n",
    "\n",
    "# Set up plots to be large enough for visualization\n",
    "# NOTE(review): nothing is configured here yet; set\n",
    "# plt.rcParams['figure.figsize'] if larger plots are wanted.\n",
    "\n",
    "# To check which library we're using\n",
    "camelot.__file__\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "kwargs = {}\n",
    "data = None\n",
    "# pdf_file = \"vertical_header.pdf\" # test_hybrid_vertical_header\n",
    "# pdf_file, kwargs = \"background_lines_1.pdf\", {} # {\"process_background\": True} # test_lattice_process_background\n",
    "\n",
    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size # test_hybrid_flag_size\n",
    "# pdf_file = \"health.pdf\" # test_hybrid\n",
    "# pdf_file = \"clockwise_table_2.pdf\"\n",
    "# pdf_file = 
\"tabula/12s0324.pdf\" # interesting because contains two separate tables\n", "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_hybrid_table_regions\n", "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"} # data_stream_strip_text\n", "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text # data_stream_split_text\n", "\n", "pdf_file = \"vertical_header.pdf\"\n", "\n", "# pdf_file = \"twotables_2.pdf\"\n", "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n", "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n", "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n", "\n", "filename = os.path.join(testdir, pdf_file)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": "Found 1 tables (17x22) in 0.77s\n" }, { "output_type": "display_data", "data": { "text/plain": " 0 1 2 \\\n0 \n1 \n2 \n3 Number of Registered voters Poll Book Totals \n4 Alcona 963 439 \n5 Caledonia 923 393 \n6 Curtis 1026 349 \n7 Greenbush 1212 423 \n8 Gustin 611 180 \n9 Harrisville 1142 430 \n10 Hawes 884 293 \n11 Haynes 626 275 \n12 Mikado 781 208 \n13 Millen 353 139 \n14 Mitchell 327 96 \n15 City Harrisville 389 171 \n16 Totals 9237 3396 \n\n 3 4 5 6 7 \\\n0 \n1 \n2 Governor \n3 Brian Calley Patrick Colbeck Jim Hines Bill Schuette John James \n4 55 26 47 164 173 \n5 40 23 45 158 150 \n6 30 30 25 102 95 \n7 56 26 40 126 104 \n8 22 35 17 55 73 \n9 45 90 29 101 155 \n10 38 36 27 109 121 \n11 31 20 32 104 121 \n12 19 39 17 81 90 \n13 7 16 13 38 49 \n14 12 17 7 29 41 \n15 16 15 18 35 49 \n16 371 373 317 1102 1221 \n\n 8 9 ... 12 13 14 \\\n0 ... Senator 36th Rep106th \n1 ... Dist. Dist. \n2 U.S. Senator ... \n3 Sandy Pensler ... Jim Stamas Sue Allor Melissa A. Cordes \n4 111 ... 
272 275 269 \n5 103 ... 247 254 255 \n6 84 ... 164 162 161 \n7 131 ... 213 214 215 \n8 45 ... 104 111 111 \n9 94 ... 226 232 244 \n10 84 ... 195 195 193 \n11 53 ... 163 173 161 \n12 63 ... 149 145 147 \n13 19 ... 66 67 66 \n14 17 ... 55 57 60 \n15 31 ... 80 82 81 \n16 835 0 ... 1934 1967 1963 \n\n 15 16 17 18 19 \\\n0 Reg. of Road \n1 Deeds Commission District #1 \n2 \n3 Al Scully Daniel G. Gauthier Craig M. Clemens \n4 271 224 76 \n5 244 139 143 \n6 157 \n7 208 \n8 109 \n9 226 \n10 184 \n11 152 \n12 143 \n13 62 \n14 56 \n15 77 \n16 0 1889 0 363 219 \n\n 20 21 \n0 Distri Dist \n1 ct #2 #3\\nDist #4 \n2 \n3 Craig Johnston Carolyn Brummund\\nAdam Brege\\nDavid Bielusiak \n4 \n5 \n6 \n7 208 \n8 81\\n42 \n9 232 \n10 118\\n87 \n11 76 69\\n31 \n12 113 \n13 \n14 \n15 73 \n16 381 321\\n268\\n160 \n\n[17 rows x 22 columns]", "text/html": "
| \n | 0 | \n1 | \n2 | \n3 | \n4 | \n5 | \n6 | \n7 | \n8 | \n9 | \n... | \n12 | \n13 | \n14 | \n15 | \n16 | \n17 | \n18 | \n19 | \n20 | \n21 | \n
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n\n | \n | \n | \n | \n | \n | \n | \n | \n | \n | ... | \nSenator 36th | \nRep106th | \n\n | Reg. of | \n\n | Road | \n\n | \n | Distri | \nDist | \n
| 1 | \n\n | \n | \n | \n | \n | \n | \n | \n | \n | \n | ... | \nDist. | \nDist. | \n\n | Deeds | \n\n | Commission | \n\n | District #1 | \nct #2 | \n#3\\nDist #4 | \n
| 2 | \n\n | \n | \n | \n | \n | Governor | \n\n | \n | U.S. Senator | \n\n | ... | \n\n | \n | \n | \n | \n | \n | \n | \n | \n | \n |
| 3 | \n\n | Number of Registered voters | \nPoll Book Totals | \nBrian Calley | \nPatrick Colbeck | \nJim Hines | \nBill Schuette | \nJohn James | \nSandy Pensler | \n\n | ... | \nJim Stamas | \nSue Allor | \nMelissa A. Cordes | \n\n | Al Scully | \n\n | Daniel G. Gauthier | \nCraig M. Clemens | \nCraig Johnston | \nCarolyn Brummund\\nAdam Brege\\nDavid Bielusiak | \n
| 4 | \nAlcona | \n963 | \n439 | \n55 | \n26 | \n47 | \n164 | \n173 | \n111 | \n\n | ... | \n272 | \n275 | \n269 | \n\n | 271 | \n\n | 224 | \n76 | \n\n | \n |
| 5 | \nCaledonia | \n923 | \n393 | \n40 | \n23 | \n45 | \n158 | \n150 | \n103 | \n\n | ... | \n247 | \n254 | \n255 | \n\n | 244 | \n\n | 139 | \n143 | \n\n | \n |
| 6 | \nCurtis | \n1026 | \n349 | \n30 | \n30 | \n25 | \n102 | \n95 | \n84 | \n\n | ... | \n164 | \n162 | \n161 | \n\n | 157 | \n\n | \n | \n | \n | \n |
| 7 | \nGreenbush | \n1212 | \n423 | \n56 | \n26 | \n40 | \n126 | \n104 | \n131 | \n\n | ... | \n213 | \n214 | \n215 | \n\n | 208 | \n\n | \n | \n | \n | 208 | \n
| 8 | \nGustin | \n611 | \n180 | \n22 | \n35 | \n17 | \n55 | \n73 | \n45 | \n\n | ... | \n104 | \n111 | \n111 | \n\n | 109 | \n\n | \n | \n | \n | 81\\n42 | \n
| 9 | \nHarrisville | \n1142 | \n430 | \n45 | \n90 | \n29 | \n101 | \n155 | \n94 | \n\n | ... | \n226 | \n232 | \n244 | \n\n | 226 | \n\n | \n | \n | 232 | \n\n |
| 10 | \nHawes | \n884 | \n293 | \n38 | \n36 | \n27 | \n109 | \n121 | \n84 | \n\n | ... | \n195 | \n195 | \n193 | \n\n | 184 | \n\n | \n | \n | \n | 118\\n87 | \n
| 11 | \nHaynes | \n626 | \n275 | \n31 | \n20 | \n32 | \n104 | \n121 | \n53 | \n\n | ... | \n163 | \n173 | \n161 | \n\n | 152 | \n\n | \n | \n | 76 | \n69\\n31 | \n
| 12 | \nMikado | \n781 | \n208 | \n19 | \n39 | \n17 | \n81 | \n90 | \n63 | \n\n | ... | \n149 | \n145 | \n147 | \n\n | 143 | \n\n | \n | \n | \n | 113 | \n
| 13 | \nMillen | \n353 | \n139 | \n7 | \n16 | \n13 | \n38 | \n49 | \n19 | \n\n | ... | \n66 | \n67 | \n66 | \n\n | 62 | \n\n | \n | \n | \n | \n |
| 14 | \nMitchell | \n327 | \n96 | \n12 | \n17 | \n7 | \n29 | \n41 | \n17 | \n\n | ... | \n55 | \n57 | \n60 | \n\n | 56 | \n\n | \n | \n | \n | \n |
| 15 | \nCity Harrisville | \n389 | \n171 | \n16 | \n15 | \n18 | \n35 | \n49 | \n31 | \n\n | ... | \n80 | \n82 | \n81 | \n\n | 77 | \n\n | \n | \n | 73 | \n\n |
| 16 | \nTotals | \n9237 | \n3396 | \n371 | \n373 | \n317 | \n1102 | \n1221 | \n835 | \n0 | \n... | \n1934 | \n1967 | \n1963 | \n0 | \n1889 | \n0 | \n363 | \n219 | \n381 | \n321\\n268\\n160 | \n
17 rows × 22 columns
\n