Downloading and unzipping a .zip file without writing to disk

Question 1

I have managed to get my first python script to work which downloads a list of .ZIP files from a URL and then proceeds to extract the ZIP files and writes them to disk.

I am now at a loss to achieve the next step.

My primary goal is to download and extract the zip file and pass the contents (CSV data) via a TCP stream. I would prefer not to actually write any of the zip or extracted files to disk if I could get away with it.

Here is my current script which works but unfortunately has to write the files to disk.

import urllib, urllister
import zipfile
import urllib2
import os
import time
import pickle

# check for extraction directories existence
if not os.path.isdir('downloaded'):
    os.makedirs('downloaded')

if not os.path.isdir('extracted'):
    os.makedirs('extracted')

# open logfile for downloaded data and save to local variable
if os.path.isfile('downloaded.pickle'):
    downloadedLog = pickle.load(open('downloaded.pickle'))
else:
    downloadedLog = {'key':'value'}

# remove entries older than 5 days (to maintain speed)

# path of zip files
zipFileURL = "http://www.thewebserver.com/that/contains/a/directory/of/zip/files"

# retrieve list of URLs from the webservers
usock = urllib.urlopen(zipFileURL)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()

# only parse urls
for url in parser.urls: 
    if "PUBLIC_P5MIN" in url:

        # download the file
        downloadURL = zipFileURL + url
        outputFilename = "downloaded/" + url

        # check if file already exists on disk
        if url in downloadedLog or os.path.isfile(outputFilename):
            print "Skipping " + downloadURL
            continue

        print "Downloading ",downloadURL
        response = urllib2.urlopen(downloadURL)
        zippedData = response.read()

        # save data to disk
        print "Saving to ",outputFilename
        output = open(outputFilename,'wb')
        output.write(zippedData)
        output.close()

        # extract the data
        zfobj = zipfile.ZipFile(outputFilename)
        for name in zfobj.namelist():
            uncompressed = zfobj.read(name)

            # save uncompressed data to disk
            outputFilename = "extracted/" + name
            print "Saving extracted file to ",outputFilename
            output = open(outputFilename,'wb')
            output.write(uncompressed)
            output.close()

            # send data via tcp stream

            # file successfully downloaded and extracted store into local log and filesystem log
            downloadedLog[url] = time.time();
            pickle.dump(downloadedLog, open('downloaded.pickle', "wb" ))

Question 2

My suggestion would be to use a StringIO object. They emulate files, but reside in memory. So you could do something like this:

# get_zip_data() gets a zip archive containing 'foo.txt', reading 'hey, foo'

import zipfile
from StringIO import StringIO

zipdata = StringIO()
zipdata.write(get_zip_data())
myzipfile = zipfile.ZipFile(zipdata)
foofile = myzipfile.open('foo.txt')
print foofile.read()

# output: "hey, foo"

Or more simply (apologies to Vishal):

myzipfile = zipfile.ZipFile(StringIO(get_zip_data()))
for name in myzipfile.namelist():
    [ ... ]

In Python 3 use BytesIO instead of StringIO:

import zipfile
from io import BytesIO

filebytes = BytesIO(get_zip_data())
myzipfile = zipfile.ZipFile(filebytes)
for name in myzipfile.namelist():
    [ ... ]

Downloading and unzipping a .zip file without writing to disk

Mangs

Answer a question

Answers

所有评论(0)

Mangs