.. _faq:

FAQ
===

This part of the documentation answers some common questions. If you want to add a question, you can simply open an issue `here `_.

How can I optimize memory usage for long PDFs?
-----------------------------------------------

To optimize memory usage, flush the extracted tables to disk every ``n`` pages instead of keeping them all in memory. For more information, refer to this snippet of code from `@anakin87 `_.

.. code-block:: python3

    # These long PDFs contain regional balance sheets.
    # Code (chunked extraction) is similar to this:

    import json
    import logging
    import pathlib
    import shutil

    import camelot

    logger = logging.getLogger(__name__)


    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]


    def extract_tables_from_path(filename, pages, params=None):
        std_params_set = {
            'flavor': 'lattice',
            'line_scale': 65,
            'copy_text': ['h', 'v'],
            'split_text': True
        }
        # Keys to export in JSON
        selected_keys = ['rows', 'whitespace', '_bbox', 'cols', 'page',
                         'shape', 'flavor', 'order', 'accuracy']

        logger.info('\n\n' + '*' * 50 + 'START' + '*' * 50)
        logger.info('WORKING ON FILE {}'.format(filename))

        dir_name = filename.rpartition('/')[-1].rpartition('.')[0]
        dir_path = '/comuni-data/exp_tables/' + dir_name
        dir_temp = dir_path + '/temp'

        # Clean and recreate the output directory
        try:
            shutil.rmtree(dir_path, ignore_errors=True)
            pathlib.Path(dir_temp).mkdir(parents=True, exist_ok=True)
        except OSError:
            logger.exception('Error in cleaning/creating output directory')
            return None

        params_set = params if params else std_params_set
        logger.info('USING THE FOLLOWING PARAMETERS: {}'.format(params_set))

        # Resolve the page numbers (using a private Camelot helper)
        # and subdivide the extraction work into 50-page parts.
        # AT THE END OF EVERY PART, SAVE THE DATA ON DISK AND FREE RAM
        handler = camelot.handlers.PDFHandler(filename)
        pages_list = handler._get_pages(filename, pages=pages)
        pages_chunks = list(chunks(pages_list, 50))

        last_index = 0
        tot_tables = 0
        for chunk in pages_chunks:
            pages_string = ','.join(str(page) for page in chunk)
            try:
                tables = camelot.read_pdf(filename, pages=pages_string, **params_set)
            except Exception:
                logger.exception('ERROR IN TABLE EXTRACTION')
                return None

            # First filter: discard degenerate 1x1 tables
            new_table_list = [table for table in tables if table.shape != (1, 1)]
            # Second filter: keep only tables with acceptable accuracy,
            # low whitespace and no undecoded (cid:) characters
            new_table_list = [table for table in new_table_list
                              if (table.parsing_report['accuracy'] > 75
                                  or table.parsing_report['accuracy'] < 0)
                              and table.parsing_report['whitespace'] < 80
                              and '(cid:' not in str(table.data)]

            logger.info('SAVING EXTRACTION')
            # Export the selected fields to JSON, then move on to the next
            # chunk so that this one can be garbage-collected
            for table in new_table_list:
                table_dict = {key: table.__dict__[key] for key in selected_keys}
                table_dict['id'] = last_index
                table_dict['filepath'] = filename
                table_dict['json_data'] = table.df.to_json()
                table_filename = '{}/table-{}.json'.format(dir_path, last_index)
                with open(table_filename, "w") as file:
                    json.dump(table_dict, file)
                # Keep numbering continuous across chunks
                last_index += 1

            tot_tables += len(new_table_list)

        logger.info('{} VALID TABLES DETECTED'.format(tot_tables))
        logger.info('*' * 50 + 'END' + '*' * 50)
        # ApiResponse is an application-specific container defined elsewhere
        api_response = ApiResponse(n_of_valid_tables=tot_tables,
                                   output_directory=str(pathlib.Path(dir_path).resolve()))
        return api_response
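
If you don't need the filtering and JSON export above, the chunking idea can be reduced to a few lines. Below is a minimal sketch, not part of the original snippet: the file name and chunk size are placeholders, and ``_get_pages`` is a private Camelot helper whose signature may differ between versions. It reads a long PDF 50 pages at a time and writes each batch to disk before moving on, so only one batch of tables is ever held in memory.

.. code-block:: python3

    import camelot

    filename = 'long.pdf'  # placeholder path

    # Resolve the concrete page numbers via the same private helper
    # used in the snippet above
    handler = camelot.handlers.PDFHandler(filename)
    page_numbers = handler._get_pages(filename, pages='all')

    for start in range(0, len(page_numbers), 50):
        batch = page_numbers[start:start + 50]
        tables = camelot.read_pdf(filename, pages=','.join(map(str, batch)))
        for table in tables:
            # Persist each table immediately so the batch can be
            # garbage-collected before the next one is extracted
            table.to_csv('table-p{}-o{}.csv'.format(table.page, table.order))
        del tables

The chunk size trades speed against peak memory: smaller chunks keep memory lower but incur more per-call overhead in ``read_pdf``.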