Fetching online data

The snippets below show how to download a binary file, and how to fetch and parse JSON.

Note: on Python 2, urlopen and Request live in the urllib2 module, so you would import urllib2 instead of urllib.request.

import json
import urllib.request
import http
import os

def fetch_json(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The body is decoded using the charset declared in the response's
    Content-Type header; UTF-8 is assumed when no charset is declared.

    Args:
        url: URL string (or ``urllib.request.Request``) to open.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        urllib.error.URLError: if the request fails.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    # Only ``urllib.request`` is imported, so urlopen must be qualified.
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return json.loads(response.read().decode(charset))

def download(url, target_file):
    """Download the resource at ``url`` and save it to ``target_file``.

    The response body is copied to disk in fixed-size chunks, so large
    files are never held in memory in full. HTTPS certificates are
    verified by ``urlopen``'s default SSL context.

    Args:
        url: URL string (or ``urllib.request.Request``) to open.
        target_file: Path of the file to write; an existing file is
            overwritten.

    Raises:
        urllib.error.URLError: if the request fails.
        OSError: if ``target_file`` cannot be written.
    """
    chunk_size = 64 * 1024
    with urllib.request.urlopen(url) as response:
        # Open the target only after the request succeeded, so a failed
        # request does not leave an empty file behind.
        with open(target_file, "wb") as f:
            for chunk in iter(lambda: response.read(chunk_size), b""):
                f.write(chunk)

Below is a longer example that shows:

  • how to customize the request,
  • how to read the MIME type from the response,
  • and which errors might be raised as a result.

This example first requests the URL with the HEAD method to check the MIME type, then, if the resource is an image, downloads it using the download function above.

def download_image(url, path, filename):
    """Download ``url`` into ``path`` if it points to a PNG or JPEG image.

    A HEAD request is sent first so the MIME type can be inspected
    without transferring the body. The file extension is derived from
    the MIME subtype and appended to ``filename``.

    Args:
        url: URL of the candidate image.
        path: Directory in which the file is stored.
        filename: File name without extension.

    Returns:
        True if the image was downloaded. False if the resource is not a
        PNG/JPEG image, the target file already exists, or one of the
        handled request errors occurred (the error is printed).
    """
    # Let the mime type determine the file extension; other subtypes
    # are skipped.
    extensions = {'png': '.png', 'jpeg': '.jpg'}
    try:
        # Perform the request using the HEAD method
        req = urllib.request.Request(url=url, headers={}, method='HEAD')
        with urllib.request.urlopen(req) as head:
            if head.headers.get_content_maintype() != 'image':
                return False
            ext = extensions.get(head.headers.get_content_subtype())
        # The HEAD response is closed here, before the download starts.
        if ext is None:
            return False

        output = os.path.join(path, filename) + ext
        if os.path.exists(output):
            return False

        print(f'Downloading {url} to {output}')
        download(url, output)
        return True
    except UnicodeEncodeError as e:
        print(f'Unicode url not supported: {url} {e}')
    except (TimeoutError,
            ConnectionResetError,
            http.client.RemoteDisconnected,
            http.client.InvalidURL,
            urllib.error.URLError) as e:
        print(f'Error loading {url}: {e}')
    return False