Update faq
parent ba5be43005
commit 2aaa913c40
@@ -109,6 +109,7 @@ This part of the documentation begins with some background information about why
    user/install
    user/how-it-works
    user/quickstart
+   user/faq
    user/advanced
    user/cli
@@ -1,46 +1,51 @@
 .. _faq:

-FAQ
-===
+Frequently Asked Questions
+==========================

-This part of the documentation answers some common questions. If you want to add some questions you can simply open an issue `here <https://github.com/camelot-dev/camelot/issues/new>`_.
+This part of the documentation answers some common questions. To add a question, please open an issue `here <https://github.com/camelot-dev/camelot/issues/new>`_.

 How to reduce memory usage for long PDFs?
----------------------------------------------------
+-----------------------------------------

 During table extraction from long PDF documents, RAM usage can grow significantly.

-A simple workaround is to divide the extraction into some chunks (for example, chunks of 50 pages); at the end of every chunk extraction, data are saved to disk.
+A simple workaround is to divide the extraction into chunks and save the extracted data to disk at the end of every chunk.

-For more information, refer to this code snippet from `@anakin87 <https://github.com/anakin87>`_:
+For more details, check out this code snippet from `@anakin87 <https://github.com/anakin87>`_:

-.. code-block:: python3
+::

     import camelot


     def chunks(l, n):
         """Yield successive n-sized chunks from l."""
         for i in range(0, len(l), n):
             yield l[i : i + n]


-    def extract_tables_with_less_memory_usage(filepath, pages, params={},
-                                               export_path='.', chunks_length=50):
+    def extract_tables(filepath, pages, chunk_size=50, export_path=".", params={}):
         """
-        Control page number
-        and subdivide the extraction work into n-pages parts (chunks_length).
-        At the end of every part, save the data on disk and free ram
+        Divide the extraction work into chunks of chunk_size pages. At the end
+        of every chunk, save the extracted data to disk and free RAM.
+
+        filepath : str
+            Filepath or URL of the PDF file.
+        pages : str, optional (default: '1')
+            Comma-separated page numbers.
+            Example: '1,3,4' or '1,4-end' or 'all'.
+        chunk_size : int, optional (default: 50)
+            Number of pages to process per chunk.
+        export_path : str, optional (default: '.')
+            Directory where the extracted tables are saved.
         """

-        # get list of document pages from Camelot handler
+        # get list of pages from camelot.handlers.PDFHandler
         handler = camelot.handlers.PDFHandler(filepath)
-        pages_list=handler._get_pages(filepath,pages=pages)
+        page_list = handler._get_pages(filepath, pages=pages)

         # chunk pages list
-        pages_chunks=list(chunks(pages_list,chunks_length))
+        page_chunks = list(chunks(page_list, chunk_size))

         # extraction and export
-        for chunk in pages_chunks:
-            pages_string=str(chunk).replace('[','').replace(']','')
+        for chunk in page_chunks:
+            pages_string = str(chunk).replace("[", "").replace("]", "")
             tables = camelot.read_pdf(filepath, pages=pages_string, **params)
-            tables.export(f'{export_path}/tables.json',f='json')
+            tables.export(f"{export_path}/tables.csv")
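As a rough illustration, the helper above could be called as follows. The file name, page range, output directory, and flavor are placeholder values, not part of the original snippet:

::

    # Hypothetical usage of the extract_tables() helper shown above.
    # "long-report.pdf", the page range, "./output" and the flavor are
    # example values; the output directory is assumed to already exist.
    extract_tables(
        "long-report.pdf",
        pages="1-end",
        chunk_size=50,
        export_path="./output",
        params={"flavor": "lattice"},
    )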