Add zip of csvs option

pull/2/head
Vinayak Mehta 2017-04-11 14:14:54 +05:30
parent 72233f25ce
commit 8e8f5bbb3b
1 changed file with 26 additions and 14 deletions

View File

@ -1,11 +1,13 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import print_function from __future__ import print_function
import os import os
import csv
import sys import sys
import glob import glob
import time import time
import zipfile import zipfile
import warnings import warnings
import cStringIO
import numpy as np import numpy as np
from docopt import docopt from docopt import docopt
@ -32,7 +34,7 @@ options:
-p, --pages <pageno> Comma-separated list of page numbers. -p, --pages <pageno> Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1] Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process. -P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv] -f, --format <format> Output format. (csv,tsv,zip,html,json,xlsx) [default: csv]
-l, --log <logfile> Log to file. -l, --log <logfile> Log to file.
-o, --output <directory> Output directory. -o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are -M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
@ -290,9 +292,9 @@ def write_to_disk(data, f='csv', output=None, filename=None):
fname = os.path.basename(filename) fname = os.path.basename(filename)
froot, __ = os.path.splitext(fname) froot, __ = os.path.splitext(fname)
if f in ['csv', 'tsv']: if f in ['csv', 'tsv']:
import csv
delimiter = ',' if f == 'csv' else '\t' delimiter = ',' if f == 'csv' else '\t'
for page_number in sorted(data.keys()): for page_number in sorted(data.keys()):
if data[page_number] is not None:
for table_number in sorted(data[page_number].keys()): for table_number in sorted(data[page_number].keys()):
dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f) dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
with open(os.path.join(output, dsvname), 'w') as outfile: with open(os.path.join(output, dsvname), 'w') as outfile:
@ -300,11 +302,21 @@ def write_to_disk(data, f='csv', output=None, filename=None):
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL) outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
for row in data[page_number][table_number]['data']: for row in data[page_number][table_number]['data']:
writer.writerow(row) writer.writerow(row)
csv_glob = glob.glob(os.path.join(output, '*.csv')) elif f == 'zip':
if len(csv_glob) > 1: csv_zip = os.path.join(output, '{0}.zip'.format(froot))
with zipfile.ZipFile(os.path.join(output, '{0}.zip'.format(froot)), 'w') as zfile: with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \
for cfile in csv_glob: as zfile:
zfile.write(cfile, os.path.basename(cfile), zipfile.ZIP_DEFLATED) for page_number in sorted(data.keys()):
if data[page_number] is not None:
for table_number in sorted(data[page_number].keys()):
csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv')
outfile = cStringIO.StringIO()
writer = csv.writer(
outfile, delimiter=',', quoting=csv.QUOTE_ALL)
for row in data[page_number][table_number]['data']:
writer.writerow(row)
zfile.writestr(csvname, outfile.getvalue())
outfile.close()
elif f == 'html': elif f == 'html':
htmlname = '{0}.html'.format(froot) htmlname = '{0}.html'.format(froot)
for page_number in sorted(data.keys()): for page_number in sorted(data.keys()):