__author__ = 'jwely'
from dnppy import core
from download_url import download_url
import os, time
__all__ = ["download_urls"]
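# note: download_url (imported above) is the sibling dnppy helper that fetches a
# single url to a local file path; a failure inside it is what the try/except
# in the loop below catches and retries.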
def download_urls(url_list, outdir, file_types=None):
"""
Downloads a list of files. Retries failed downloads
This script downloads a list of files and places it in the output directory. It was
built to be nested within "Download_filelist" to allow loops to continuously retry
failed files until they are successful or a retry limit is reached.
:param url_list: array of urls, probably as read from a text file
:param file_types: list of file types to download. Useful for excluding extraneous
metadata by only downloading 'hdf' or 'tif' for example. Please note
that often times, you actually NEED the metadata.
:param outdir: folder where files are to be placed after download
:return failed: list of files which failed download
"""
    failed = []
    url_list = core.enf_list(url_list)  # a bare string input becomes a one-item list

    # create the output folder if it doesn't already exist
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # establish a wait time that increases when downloads fail. This helps reduce
    # the frequency of REVERB server rejections for requesting too many downloads
    wait = 0
    for site in url_list:
        download = False
        url = site.rstrip()
        name = url.split("/")[-1]   # the filename is the last segment of the url

        # determine whether or not to download the file based on its extension
        if file_types is not None:
            for filetype in file_types:
                if name.endswith(filetype):
                    download = True
        else:
            download = True
        # attempt download of the file, or skip it.
        if download:
            try:
                # wait for the wait time before attempting to write a file
                time.sleep(wait)
                download_url(url, os.path.join(outdir, name))
                print("downloaded {0} (wait = {1})".format(name, wait))

                # reset the wait time once a download succeeds
                if wait >= 1:
                    wait = 0

            # track the failed download and wait longer before the next attempt
            except Exception:
                print("{0} will be retried! (wait = {1})".format(name, wait))
                wait += 5
                failed.append(url)
print("Finished downloading urls!")
return failed
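# Usage sketch (illustrative only, not part of dnppy): the docstring above says this
# function was built to be nested in a retry loop such as "Download_filelist". A
# minimal driver might look like the following; the url file name, output folder,
# and retry limit of 3 are all assumed placeholders.
#
#   with open("url_list.txt") as f:
#       urls = f.readlines()
#
#   for attempt in range(3):                       # assumed retry limit
#       urls = download_urls(urls, "downloads", file_types=["hdf"])
#       if not urls:                               # empty "failed" list means success
#           break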