From 5c3a686ebedc933e9055ba9ea947d5931fdfde47 Mon Sep 17 00:00:00 2001
From: anakin87 <44616784+anakin87@users.noreply.github.com>
Date: Tue, 8 Dec 2020 18:57:41 +0100
Subject: [PATCH 1/2] Introduce Faq

Introduced faq.
Started with reducing memory usage.
---
 docs/user/faq.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/user/faq.rst

diff --git a/docs/user/faq.rst b/docs/user/faq.rst
new file mode 100644
index 0000000..8f3b59e
--- /dev/null
+++ b/docs/user/faq.rst
@@ -0,0 +1,46 @@
+.. _faq:
+
+FAQ
+===
+
+This part of the documentation answers some common questions. If you want to add some questions you can simply open an issue `here `_.
+
+
+How to reduce memory usage for long PDFs?
+---------------------------------------------------
+
+During table extraction from long PDF documents, RAM usage can grow significantly.
+
+A simple workaround is to divide the extraction into some chunks (for example, chunks of 50 pages); at the end of every chunk extraction, data are saved to disk.
+
+For more information, refer to this code snippet from `@anakin87 `_:
+
+.. code-block:: python3
+
+    import camelot
+
+    def chunks(l, n):
+        """Yield successive n-sized chunks from l."""
+        for i in range(0, len(l), n):
+            yield l[i:i + n]
+
+    def extract_tables_with_less_memory_usage(filepath, pages, params={},
+                                              export_path='.', chunks_length=50):
+        """
+        Control page number
+        and subdivide the extraction work into n-pages parts (chunks_length).
+        At the end of every part, save the data on disk and free ram
+        """
+
+        # get list of document pages from Camelot handler
+        handler=camelot.handlers.PDFHandler(filepath)
+        pages_list=handler._get_pages(filepath,pages=pages)
+
+        # chunk pages list
+        pages_chunks=list(chunks(pages_list,chunks_length))
+
+        # extraction and export
+        for chunk in pages_chunks:
+            pages_string=str(chunk).replace('[','').replace(']','')
+            tables = camelot.read_pdf(filepath, pages=pages_string,**params)
+            tables.export(f'{export_path}/tables.json',f='json')

From 2aaa913c401e9d2dd7f599aa9e4c2b1c19e943d7 Mon Sep 17 00:00:00 2001
From: Vinayak Mehta
Date: Mon, 28 Jun 2021 00:15:43 +0530
Subject: [PATCH 2/2] Update faq

---
 docs/index.rst    |  1 +
 docs/user/faq.rst | 61 +++++++++++++++++++++++++++++++++----------------------------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index c3e1de4..b15fe33 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -109,6 +109,7 @@ This part of the documentation begins with some background information about why
    user/install
    user/how-it-works
    user/quickstart
+   user/faq
    user/advanced
    user/cli
 
diff --git a/docs/user/faq.rst b/docs/user/faq.rst
index 8f3b59e..8081083 100644
--- a/docs/user/faq.rst
+++ b/docs/user/faq.rst
@@ -1,46 +1,51 @@
 .. _faq:
 
-FAQ
-===
-
-This part of the documentation answers some common questions. If you want to add some questions you can simply open an issue `here `_.
+Frequently Asked Questions
+==========================
 
+This part of the documentation answers some common questions. To add questions, please open an issue `here `_.
 
 How to reduce memory usage for long PDFs?
----------------------------------------------------
+-----------------------------------------
 
 During table extraction from long PDF documents, RAM usage can grow significantly.
-
-A simple workaround is to divide the extraction into some chunks (for example, chunks of 50 pages); at the end of every chunk extraction, data are saved to disk.
 
-For more information, refer to this code snippet from `@anakin87 `_:
+A simple workaround is to divide the extraction into chunks, and save extracted data to disk at the end of every chunk.
 
-.. code-block:: python3
+For more details, check out this code snippet from `@anakin87 `_:
+
+::
 
     import camelot
-
+
+
     def chunks(l, n):
         """Yield successive n-sized chunks from l."""
         for i in range(0, len(l), n):
-            yield l[i:i + n]
-
-    def extract_tables_with_less_memory_usage(filepath, pages, params={},
-                                              export_path='.', chunks_length=50):
+            yield l[i : i + n]
+
+
+    def extract_tables(filepath, pages, chunk_size=50, export_path=".", params={}):
         """
-        Control page number
-        and subdivide the extraction work into n-pages parts (chunks_length).
-        At the end of every part, save the data on disk and free ram
+        Divide the extraction work into chunks of chunk_size pages.
+        At the end of every chunk, save data on disk and free RAM.
+
+        filepath : str
+            Filepath or URL of the PDF file.
+        pages : str, optional (default: '1')
+            Comma-separated page numbers.
+            Example: '1,3,4' or '1,4-end' or 'all'.
         """
-
-        # get list of document pages from Camelot handler
-        handler=camelot.handlers.PDFHandler(filepath)
-        pages_list=handler._get_pages(filepath,pages=pages)
-
+
+        # get list of pages from camelot.handlers.PDFHandler
+        handler = camelot.handlers.PDFHandler(filepath)
+        page_list = handler._get_pages(filepath, pages=pages)
+
         # chunk pages list
-        pages_chunks=list(chunks(pages_list,chunks_length))
-
+        page_chunks = list(chunks(page_list, chunk_size))
+
         # extraction and export
-        for chunk in pages_chunks:
-            pages_string=str(chunk).replace('[','').replace(']','')
-            tables = camelot.read_pdf(filepath, pages=pages_string,**params)
-            tables.export(f'{export_path}/tables.json',f='json')
+        for chunk in page_chunks:
+            pages_string = str(chunk).replace("[", "").replace("]", "")
+            tables = camelot.read_pdf(filepath, pages=pages_string, **params)
+            tables.export(f"{export_path}/tables.csv")
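
For reference, here is a minimal usage sketch of the ``extract_tables`` helper defined in the snippet above; the file name, output directory, and the ``flavor`` setting are illustrative placeholders rather than part of the patch::

    # extract_tables is the helper defined in the FAQ snippet above.
    # Process a long PDF in 50-page chunks; CSV files are written into an
    # existing "out" directory. "long_report.pdf" is a placeholder file name.
    extract_tables(
        "long_report.pdf",
        pages="all",
        chunk_size=50,
        export_path="out",
        params={"flavor": "lattice"},  # extra keyword arguments forwarded to camelot.read_pdf
    )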