From 57bce5ce554adb6f58c004ddddcb4b081823cf0f Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Mon, 14 Oct 2019 14:27:43 +0530 Subject: [PATCH 1/3] Added a section for faq --- docs/user/faq.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/user/faq.rst diff --git a/docs/user/faq.rst b/docs/user/faq.rst new file mode 100644 index 0000000..182054b --- /dev/null +++ b/docs/user/faq.rst @@ -0,0 +1,6 @@ +.. _faq: + +FAQ +=== + +How an accuracy is calculated ? From 8e69c75c9c8b7a5a033e59ca7366c62853717419 Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Mon, 14 Oct 2019 14:31:20 +0530 Subject: [PATCH 2/3] Added a section for faq --- docs/user/faq.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 182054b..85b632f 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -4,3 +4,4 @@ FAQ === How an accuracy is calculated ? +------------------------------- \ No newline at end of file From 477632bee8beae80932105dcee98d0e4e9f1d7ad Mon Sep 17 00:00:00 2001 From: nightwarrior-xxx Date: Wed, 16 Oct 2019 20:30:18 +0530 Subject: [PATCH 3/3] Added a question and answer in FAQ for optimizing the memory usage --- docs/user/faq.rst | 147 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/docs/user/faq.rst b/docs/user/faq.rst index 85b632f..aef1531 100644 --- a/docs/user/faq.rst +++ b/docs/user/faq.rst @@ -3,5 +3,148 @@ FAQ === -How an accuracy is calculated ? -------------------------------- \ No newline at end of file +This part of the documentation answers some common questions. If you want to add a question, you can simply open an issue `here <https://github.com/camelot-dev/camelot/issues/new>`_. + + +How can you optimize memory usage for long PDFs? +--------------------------------------------------- + + +In order to optimize memory usage you need to flush tables every ``n`` pages. For more information, refer to this snippet of code from `@anakin87 <https://github.com/anakin87>`_. + +.. 
code-block:: python3 + + + # These long PDF contain regional balance sheets. + # Code (chunked extraction) is similar to this: + + from main import logger + import camelot + import shutil + import pathlib + import json + import os + import glob + + def chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + def extract_tables_from_path(filename,pages,params=None): + + std_params_set = { + 'flavor': 'lattice', + 'line_scale': 65, + 'copy_text': ['h', 'v'], + 'split_text': True + } + + # keys to export in JSON + selected_keys = ['rows', + 'whitespace', + '_bbox', + 'cols', + 'page', + 'shape', + 'flavor', + 'order', + 'accuracy'] + + logger.info('\n\n' + '*' * 50 + 'START' + '*' * 50) + logger.info('WORKING ON FILE {}'.format(filename)) + + + dir_name = filename.rpartition('/')[-1].rpartition('.')[0] + dir_path = '/comuni-data/exp_tables/' + dir_name + dir_temp=dir_path+'/temp' + + # Clean and recreate output directory + try: + shutil.rmtree(dir_path, ignore_errors=True) + pathlib.Path(dir_temp).mkdir(parents=True, exist_ok=True) + except: + logger.exception('Error in cleaning/creating output directory') + return None + + params_set = params if params else std_params_set + + logger.info('USING THE FOLLOWING PARAMETERS: {}'.format(params_set)) + + + # Control page number (by Camelot method) + # and subdivide the extraction work into 50-pages parts. 
+ # AT THE END OF EVERY PART, SAVE THE DATA ON DISk AND FREE RAM + + handler=camelot.handlers.PDFHandler(filename) + pages_list=handler._get_pages(filename,pages=pages) + + pages_chunks=list(chunks(pages_list,50)) + + last_index=0 + tot_tables=0 + index=0 + + + + for chunk in pages_chunks: + tables=[] + new_table_list=[] + + pages_string=str(chunk).replace('[','').replace(']','') + + + + try: + tables = camelot.read_pdf(filename, pages=pages_string, **params_set) + except Exception as e: + logger.exception('ERROR IN TABLE EXTRACTION') + return None + + + + + # First filter + new_table_list =[table for table in tables if table.shape != (1, 1)] + + # Second filter + + new_table_list = [table for table in new_table_list if (table.parsing_report['accuracy'] > 75 \ + or table.parsing_report['accuracy'] < 0) \ + and table.parsing_report['whitespace'] < 80\ + and '(cid:' not in str(table.data)] + + + + logger.info('SAVING EXTRACTION') + + # Exports in JSON the selected fields + tables_bboxes = [] + + for index, table in enumerate(new_table_list,last_index): + table_dict = {key: table.__dict__[key] for key in selected_keys} + + table_dict['id'] = index + table_dict['filepath'] = filename + table_dict['json_data'] = table.__dict__['df'].to_json() + + table_filename = '{}/table-{}.json'.format(dir_path, index) + + + with open(table_filename, "w") as file: + json.dump(table_dict, file) + + + last_index=index + tot_tables+=len(new_table_list) + + + + + logger.info('{} VALID TABLES DETECTED'.format(tot_tables)) + logger.info('*' * 50 + 'END' + '*' * 50) + + api_response=ApiResponse(n_of_valid_tables=tot_tables,output_directory=str(pathlib.Path(dir_path).resolve())) + + + return api_response