Source code for astroML.datasets.dr7_quasar

"""
SDSS DR7 Quasar Dataset Loader.

This implements a loader for the DR7 quasar dataset, located at
http://www.sdss.org/dr7/products/value added/qsocat_dr7.html
"""
import os
from gzip import GzipFile
from io import BytesIO

import numpy as np

from .tools import download_with_progress_bar
from . import get_data_home

DATA_URL = 'http://das.sdss.org/va/qsocat/dr7qso.dat.gz'

ARCHIVE_FILE = 'dr7_quasar.npy'

# column numbers for extraction
DR7_DTYPE = [('sdssID', 'a14'),
             ('RA', 'f8'),
             ('dec', 'f8'),
             ('redshift', 'f4'),
             ('mag_u', 'f4'),
             ('err_u', 'f4'),
             ('mag_g', 'f4'),
             ('err_g', 'f4'),
             ('mag_r', 'f4'),
             ('err_r', 'f4'),
             ('mag_i', 'f4'),
             ('err_i', 'f4'),
             ('mag_z', 'f4'),
             ('err_z', 'f4'),
             ('mag_J', 'f4'),
             ('err_J', 'f4'),
             ('mag_H', 'f4'),
             ('err_H', 'f4'),
             ('mag_K', 'f4'),
             ('err_K', 'f4'),
             ('specobjid', 'i8')]

COLUMN_NUMBERS = [0, 1, 2, 3,
                  4, 5, 6, 7,
                  8, 9, 10, 11, 12, 13,
                  22, 23, 24, 25, 26, 27, 72]

# length of header information
SKIP_ROWS = 80


[docs]def fetch_dr7_quasar(data_home=None, download_if_missing=True): """Loader for SDSS DR7 quasar catalog Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all astroML data is stored in '~/astroML_data'. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : ndarray, shape = (105783,) numpy record array containing the quasar catalog Examples -------- >>> from astroML.datasets import fetch_dr7_quasar >>> data = fetch_dr7_quasar() # doctest: +IGNORE_OUTPUT +REMOTE_DATA >>> u_g = data['mag_u'] - data['mag_g'] # doctest: +REMOTE_DATA >>> u_g[:3] # first three u-g colors # doctest: +REMOTE_DATA array([-0.07699966, 0.03600121, 0.10900116], dtype=float32) Notes ----- Not all available data is extracted and saved. The extracted columns are: sdssID, RA, DEC, redshift, mag_u, err_u, mag_g, err_g, mag_r, err_r, mag_i, err_i, mag_z, err_z, mag_J, err_J, mag_H, err_H, mag_K, err_K, specobjid many of the objects are missing 2mass photometry. More information at http://www.sdss.org/dr7/products/value_added/qsocat_dr7.html """ data_home = get_data_home(data_home) archive_file = os.path.join(data_home, ARCHIVE_FILE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading DR7 quasar dataset from %s to %s" % (DATA_URL, data_home)) zipped_buf = download_with_progress_bar(DATA_URL, return_buffer=True) gzf = GzipFile(fileobj=zipped_buf, mode='rb') extracted_buf = BytesIO(gzf.read()) data = np.loadtxt(extracted_buf, skiprows=SKIP_ROWS, usecols=COLUMN_NUMBERS, dtype=DR7_DTYPE) np.save(archive_file, data) else: data = np.load(archive_file) return data