Added a question and answer in FAQ for optimizing the memory usage
parent 8e69c75c9c
commit 477632bee8

@@ -3,5 +3,148 @@

FAQ
===

This part of the documentation answers some common questions. If you want to add a question, you can simply open an issue `here <https://github.com/camelot-dev/camelot/issues/new>`_.

How can you optimize memory usage for long PDFs?
------------------------------------------------

To optimize memory usage, you need to flush the extracted tables to disk every ``n`` pages instead of keeping them all in memory. For more information, refer to this snippet of code from `@anakin87 <https://github.com/anakin87>`_.

.. code-block:: python3

    # These long PDFs contain regional balance sheets.
    # The code (chunked extraction) is similar to this:

    from main import logger  # project-specific logger

    import camelot
    import shutil
    import pathlib
    import json

    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
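
    # e.g. list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]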

    def extract_tables_from_path(filename, pages, params=None):
        # default Camelot parameters, overridable through `params`
        std_params_set = {
            'flavor': 'lattice',
            'line_scale': 65,
            'copy_text': ['h', 'v'],
            'split_text': True
        }

        # keys to export in JSON
        selected_keys = ['rows', 'whitespace', '_bbox', 'cols', 'page',
                         'shape', 'flavor', 'order', 'accuracy']
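
        # the keys above are attributes that Camelot sets on each Table
        # object during parsing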

        logger.info('\n\n' + '*' * 50 + 'START' + '*' * 50)
        logger.info('WORKING ON FILE {}'.format(filename))

        # Build the output paths from the PDF file name
        dir_name = filename.rpartition('/')[-1].rpartition('.')[0]
        dir_path = '/comuni-data/exp_tables/' + dir_name
        dir_temp = dir_path + '/temp'

        # Clean and recreate the output directory
        try:
            shutil.rmtree(dir_path, ignore_errors=True)
            pathlib.Path(dir_temp).mkdir(parents=True, exist_ok=True)
        except Exception:
            logger.exception('Error in cleaning/creating output directory')
            return None

        params_set = params if params else std_params_set

        logger.info('USING THE FOLLOWING PARAMETERS: {}'.format(params_set))

        # Resolve the page numbers (with a Camelot handler) and subdivide
        # the extraction work into 50-page parts.
        # AT THE END OF EVERY PART, SAVE THE DATA ON DISK AND FREE RAM
        handler = camelot.handlers.PDFHandler(filename)
        pages_list = handler._get_pages(filename, pages=pages)

        pages_chunks = list(chunks(pages_list, 50))
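
        # note: _get_pages is a private Camelot helper that expands a pages
        # string such as '1,3,4', '1-end' or 'all' into a list of page
        # numbers; being private, it may change between Camelot versions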

        last_index = 0
        tot_tables = 0
        index = 0

        for chunk in pages_chunks:
            # rebinding these names drops the previous chunk's tables,
            # so they can be garbage-collected and RAM stays bounded
            tables = []
            new_table_list = []

            # e.g. [1, 2, 3] -> '1, 2, 3'
            pages_string = str(chunk).replace('[', '').replace(']', '')

            try:
                tables = camelot.read_pdf(filename, pages=pages_string, **params_set)
            except Exception:
                logger.exception('ERROR IN TABLE EXTRACTION')
                return None

            # First filter: discard degenerate 1x1 tables
            new_table_list = [table for table in tables if table.shape != (1, 1)]

            # Second filter: keep tables that are accurate enough and not
            # mostly whitespace
            new_table_list = [table for table in new_table_list
                              if (table.parsing_report['accuracy'] > 75
                                  or table.parsing_report['accuracy'] < 0)
                              and table.parsing_report['whitespace'] < 80
                              and '(cid:' not in str(table.data)]
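
            # '(cid:' appears when the PDF font has no character map,
            # i.e. the extracted text would be unreadable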

            logger.info('SAVING EXTRACTION')

            # Export the selected fields of every table to JSON
            for index, table in enumerate(new_table_list, last_index):
                table_dict = {key: table.__dict__[key] for key in selected_keys}

                table_dict['id'] = index
                table_dict['filepath'] = filename
                table_dict['json_data'] = table.df.to_json()

                table_filename = '{}/table-{}.json'.format(dir_path, index)

                with open(table_filename, "w") as file:
                    json.dump(table_dict, file)

            last_index = index + 1  # continue table numbering in the next chunk
            tot_tables += len(new_table_list)

        logger.info('{} VALID TABLES DETECTED'.format(tot_tables))
        logger.info('*' * 50 + 'END' + '*' * 50)

        # ApiResponse is a project-specific class, not shown in this snippet
        api_response = ApiResponse(n_of_valid_tables=tot_tables,
                                   output_directory=str(pathlib.Path(dir_path).resolve()))

        return api_response
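
For reference, a call to the function above might look like the sketch below. The file path is only an example, and both ``logger`` and ``ApiResponse`` come from the author's own project, so treat this as illustrative rather than runnable as-is.

.. code-block:: python3

    # Hypothetical invocation: process every page of a long PDF,
    # 50 pages at a time (the path is an example, not a real file)
    response = extract_tables_from_path('/comuni-data/pdfs/sheet.pdf', pages='all')
    if response is not None:
        print(response.output_directory)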