From 57bce5ce554adb6f58c004ddddcb4b081823cf0f Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Mon, 14 Oct 2019 14:27:43 +0530 Subject: [PATCH 1/3] Added a section for faq --- docs/user/faq.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/user/faq.rst diff --git a/docs/user/faq.rst b/docs/user/faq.rst new file mode 100644 index 0000000..182054b --- /dev/null +++ b/docs/user/faq.rst @@ -0,0 +1,6 @@ +.. _faq: + +FAQ +=== + +How an accuracy is calculated ? From 8e69c75c9c8b7a5a033e59ca7366c62853717419 Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Mon, 14 Oct 2019 14:31:20 +0530 Subject: [PATCH 2/3] Added a section for faq --- docs/user/faq.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 182054b..85b632f 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -4,3 +4,4 @@ FAQ === How an accuracy is calculated ? +------------------------------- \ No newline at end of file From 477632bee8beae80932105dcee98d0e4e9f1d7ad Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Wed, 16 Oct 2019 20:30:18 +0530 Subject: [PATCH 3/3] Added a question and answer in FAQ for optimizing the memory usage --- docs/user/faq.rst | 147 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 85b632f..aef1531 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -3,5 +3,148 @@ FAQ === -How an accuracy is calculated ? -------------------------------- \ No newline at end of file +This part of the documentation answers some common questions. If you want to add a question, you can simply open an issue `here <https://github.com/camelot-dev/camelot/issues/new>`_. + + +How can you optimize memory usage for long PDFs? +--------------------------------------------------- + + +In order to optimize memory usage you need to flush tables every ``n`` pages. For more information, refer to this snippet of code from `@anakin87 <https://github.com/anakin87>`_. + +.. 
code-block:: python3 + + + # These long PDF contain regional balance sheets. + # Code (chunked extraction) is similar to this: + + from main import logger + import camelot + import shutil + import pathlib + import json + import os + import glob + + def chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + def extract_tables_from_path(filename,pages,params=None): + + std_params_set = { + 'flavor': 'lattice', + 'line_scale': 65, + 'copy_text': ['h', 'v'], + 'split_text': True + } + + # keys to export in JSON + selected_keys = ['rows', + 'whitespace', + '_bbox', + 'cols', + 'page', + 'shape', + 'flavor', + 'order', + 'accuracy'] + + logger.info('\n\n' + '*' * 50 + 'START' + '*' * 50) + logger.info('WORKING ON FILE {}'.format(filename)) + + + dir_name = filename.rpartition('/')[-1].rpartition('.')[0] + dir_path = '/comuni-data/exp_tables/' + dir_name + dir_temp=dir_path+'/temp' + + # Clean and recreate output directory + try: + shutil.rmtree(dir_path, ignore_errors=True) + pathlib.Path(dir_temp).mkdir(parents=True, exist_ok=True) + except: + logger.exception('Error in cleaning/creating output directory') + return None + + params_set = params if params else std_params_set + + logger.info('USING THE FOLLOWING PARAMETERS: {}'.format(params_set)) + + + # Control page number (by Camelot method) + # and subdivide the extraction work into 50-pages parts. 
+ # AT THE END OF EVERY PART, SAVE THE DATA ON DISk AND FREE RAM + + handler=camelot.handlers.PDFHandler(filename) + pages_list=handler._get_pages(filename,pages=pages) + + pages_chunks=list(chunks(pages_list,50)) + + last_index=0 + tot_tables=0 + index=0 + + + + for chunk in pages_chunks: + tables=[] + new_table_list=[] + + pages_string=str(chunk).replace('[','').replace(']','') + + + + try: + tables = camelot.read_pdf(filename, pages=pages_string, **params_set) + except Exception as e: + logger.exception('ERROR IN TABLE EXTRACTION') + return None + + + + + # First filter + new_table_list =[table for table in tables if table.shape != (1, 1)] + + # Second filter + + new_table_list = [table for table in new_table_list if (table.parsing_report['accuracy'] > 75 \ + or table.parsing_report['accuracy'] < 0) \ + and table.parsing_report['whitespace'] < 80\ + and '(cid:' not in str(table.data)] + + + + logger.info('SAVING EXTRACTION') + + # Exports in JSON the selected fields + tables_bboxes = [] + + for index, table in enumerate(new_table_list,last_index): + table_dict = {key: table.__dict__[key] for key in selected_keys} + + table_dict['id'] = index + table_dict['filepath'] = filename + table_dict['json_data'] = table.__dict__['df'].to_json() + + table_filename = '{}/table-{}.json'.format(dir_path, index) + + + with open(table_filename, "w") as file: + json.dump(table_dict, file) + + + last_index=index + tot_tables+=len(new_table_list) + + + + + logger.info('{} VALID TABLES DETECTED'.format(tot_tables)) + logger.info('*' * 50 + 'END' + '*' * 50) + + api_response=ApiResponse(n_of_valid_tables=tot_tables,output_directory=str(pathlib.Path(dir_path).resolve())) + + + return api_response