""" Applying selection functions to catalog """
import os
import numpy as np
from ceci.config import StageParameter as Param
from rail.creation.degrader import Degrader
from scipy.interpolate import interp1d
from rail.core.utils import RAILDIR
[docs]class SpecSelection(Degrader):
"""
The super class of spectroscopic selections.
Parameters
----------
N_tot: integer
total number of down-sampled, spec-selected galaxies.
If N_tot is greater than the number of spec-sepected galaxies, then
it will be ignored.
nondetect_val: value to be removed for non detects
downsample: bool
If True, then downsample the pre-selected galaxies
to N_tot galaxies.
success_rate_dir: string, the path to the success rate files.
percentile_cut: If using color-based redshift cut, percentile in redshifts above which redshifts will be cut from the sample. Default is 100 (no cut)
colnames: a dictionary that includes necessary columns
(magnitudes, colors and redshift) for selection. For magnitudes, the keys are ugrizy; for colors, the keys are,
for example, gr standing for g-r; for redshift, the key is 'redshift'.
random_seed: random seed for reproducibility.
"""
name = "specselection"
config_options = Degrader.config_options.copy()
config_options.update(
N_tot=Param(int, 10000, msg="Number of selected sources"),
nondetect_val=Param(float, 99.0, msg="value to be removed for non detects"),
downsample=Param(
bool,
True,
msg="If true, downsample the selected sources into a total number of N_tot",
),
success_rate_dir=Param(
str,
os.path.join(RAILDIR,
"rail/examples_data/creation_data/data/success_rate_data",
),
msg="The path to the directory containing success rate files.",
),
percentile_cut=Param(int, 100, msg="cut redshifts above this percentile"),
colnames=Param(
dict,
{
**{band: "mag_" + band + "_lsst" for band in "ugrizy"},
**{"redshift": "redshift"},
},
msg="a dictionary that includes necessary columns\
(magnitudes, colors and redshift) for selection. For magnitudes, the keys are ugrizy; for colors, the keys are, \
for example, gr standing for g-r; for redshift, the key is 'redshift'",
),
random_seed=Param(int, 42, msg="random seed for reproducibility"),
)
def __init__(self, args, comm=None):
Degrader.__init__(self, args, comm=comm)
# validate the settings
self._validate_settings()
self.mask = None
self.rng = None
def _validate_settings(self):
"""
Validate all the settings.
"""
if self.config.N_tot < 0:
raise ValueError(
"Total number of selected sources must be a " "positive integer."
)
if os.path.exists(self.config.success_rate_dir) is not True:
raise ValueError(
"Success rate path: "
+ self.config.success_rate_dir
+ " does not exist!"
)
[docs] def validate_colnames(self, data):
"""
Validate the column names of data table to make sure they have necessary information
for each selection.
colnames: a list of column names.
"""
colnames = self.config.colnames.values()
check = all(item in data.columns for item in colnames)
if check is not True:
raise ValueError(
"Columns in the data are not enough for the selection."
+ "The data should contain "
+ str(list(colnames))
+ ". \n"
)
[docs] def selection(self, data):
"""
Selection functions. This should be overwritten by the subclasses
corresponding to different spec selections.
"""
[docs] def invalid_cut(self, data):
"""
This function removes entries in the data that have invalid magnitude values
(nondetect_val or NaN)
"""
nondetect_val = self.config.nondetect_val
for band in "ugrizy":
if band not in self.config.colnames.keys():
continue
colname = self.config.colnames[band]
self.mask &= (np.abs(data[colname]) < nondetect_val) & (
~np.isnan(data[colname])
)
[docs] def downsampling_N_tot(self):
"""
Method to randomly sample down the objects to a given
number of data objects.
"""
N_tot = self.config.N_tot
N_selected = np.count_nonzero(self.mask)
if N_tot > N_selected:
print(
"Warning: N_tot is greater than the size of spec-selected "
+ "sample ("
+ str(N_selected)
+ "). The spec-selected sample "
+ "is returned."
)
return
else:
idx_selected = np.where(self.mask)[0]
idx_keep = self.rng.choice(idx_selected, replace=False, size=N_tot)
# create a mask with only those entries enabled that have been
# selected
mask = np.zeros_like(self.mask)
mask[idx_keep] = True
# update the internal state
self.mask &= mask
[docs] def run(self):
"""
Run the selection
"""
self.rng = np.random.default_rng(seed=self.config.seed)
# get the bands and bandNames present in the data
data = self.get_data("input", allow_missing=True)
self.validate_colnames(data)
self.mask = np.product(~np.isnan(data.to_numpy()), axis=1)
self.invalid_cut(data)
self.selection(data)
if self.config.downsample is True:
self.downsampling_N_tot()
data_selected = data.iloc[np.where(self.mask == 1)[0]]
self.add_data("output", data_selected)
def __repr__(self):
"""
Define how the model is represented and printed.
"""
[docs]class SpecSelection_GAMA(SpecSelection):
"""
The class of spectroscopic selections with GAMA.
The GAMA survey covers an area of 286 deg^2, with ~238000 objects
The necessary column is r band
"""
name = "specselection_gama"
[docs] def selection(self, data):
"""
GAMA selection function
"""
print("Applying the selection from GAMA survey...")
self.mask *= data[self.config.colnames["r"]] < 19.87
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the GAMA selection."
return printMsg
[docs]class SpecSelection_BOSS(SpecSelection):
"""
The class of spectroscopic selections with BOSS.
BOSS selection function is based on
http://www.sdss3.org/dr9/algorithms/boss_galaxy_ts.php
The selection has changed slightly compared to Dawson+13
BOSS covers an area of 9100 deg^2 with 893,319 galaxies.
For BOSS selection, the data should at least include gri bands.
"""
name = "specselection_boss"
[docs] def selection(self, data):
"""
The BOSS selection function.
"""
print("Applying the selection from BOSS survey...")
# cut quantities (unchanged)
c_p = 0.7 * (
data[self.config.colnames["g"]] - data[self.config.colnames["r"]]
) + 1.2 * (
data[self.config.colnames["r"]] - data[self.config.colnames["i"]] - 0.18
)
c_r = (
(data[self.config.colnames["r"]] - data[self.config.colnames["i"]])
- (data[self.config.colnames["g"]] - data[self.config.colnames["r"]]) / 4.0
- 0.18
)
d_r = (data[self.config.colnames["r"]] - data[self.config.colnames["i"]]) - (
data[self.config.colnames["g"]] - data[self.config.colnames["r"]]
) / 8.0
# defining the LOWZ sample
# we cannot apply the r_psf - r_cmod cut
low_z = (
(data[self.config.colnames["r"]] > 16.0)
& (data[self.config.colnames["r"]] < 20.0)
& (np.abs(c_r) < 0.2) # 19.6
& (data[self.config.colnames["r"]] < 13.35 + c_p / 0.3)
) # 13.5, 0.3
# defining the CMASS sample
# we cannot apply the i_fib2, i_psf - i_mod and z_psf - z_mod cuts
cmass = (
(data[self.config.colnames["i"]] > 17.5)
& (data[self.config.colnames["i"]] < 20.1)
& (d_r > 0.55) # 19.9
& (data[self.config.colnames["i"]] < 19.98 + 1.6 * (d_r - 0.7))
& ( # 19.86, 1.6, 0.8
(data[self.config.colnames["r"]] - data[self.config.colnames["i"]])
< 2.0
)
)
# NOTE: we ignore the CMASS sparse sample
self.mask *= low_z | cmass
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the BOSS selection."
return printMsg
[docs]class SpecSelection_DEEP2(SpecSelection):
"""
The class of spectroscopic selections with DEEP2.
DEEP2 has a sky coverage of 2.8 deg^2 with ~53000 spectra.
For DEEP2, one needs R band magnitude, B-R/R-I colors which are not available for the time being.
So we use LSST gri bands now. When the conversion degrader is ready, this subclass will be updated
accordingly.
"""
name = "specselection_deep2"
[docs] def photometryCut(self, data):
"""
Applying DEEP2 photometric cut based on Newman+13.
This modified selection gives the best match to the data n(z) with
its cut at z~0.75 and the B-R/R-I distribution (Newman+13, Fig. 12)
Notes
-----
We cannot apply the surface brightness cut and do not apply the
Gaussian weighted sampling near the original colour cuts.
"""
mask = (
(data[self.config.colnames["r"]] > 18.5)
& (data[self.config.colnames["r"]] < 24.1)
& ( # 24.1
(
data[self.config.colnames["g"]] - data[self.config.colnames["r"]]
< 2.45
* (
data[self.config.colnames["r"]]
- data[self.config.colnames["i"]]
)
- 0.2976
)
|
# 2.45, 0.2976
(
data[self.config.colnames["r"]] - data[self.config.colnames["i"]]
> 1.1
)
| (
data[self.config.colnames["g"]] - data[self.config.colnames["r"]]
< 0.5
)
)
) # 0.5
# update the internal state
self.mask &= mask
[docs] def speczSuccess(self, data):
"""
Spec-z success rate as function of r_AB for Q>=3 read of Figure 13 in
Newman+13 for DEEP2 fields 2-4. Values are binned in steps of 0.2 mag
with the first and last bin centered on 19 and 24.
"""
success_R_bins = np.arange(18.9, 24.1 + 0.01, 0.2)
success_R_centers = (success_R_bins[1:] + success_R_bins[:-1]) / 2.0
# paper has given 1 - [sucess rate] in the histogram
success_rate_dir = self.config.success_rate_dir
success_R_rate = np.loadtxt(os.path.join(success_rate_dir, "DEEP2_success.txt"))
# interpolate the success rate as probability of being selected with
# the probability at R > 24.1 being 0
p_success_R = interp1d(
success_R_centers,
success_R_rate,
kind="quadratic",
bounds_error=False,
fill_value=(success_R_rate[0], 0.0),
)
# Randomly sample objects according to their success rate
random_draw = self.rng.random(len(data))
mask = random_draw < p_success_R(data[self.config.colnames["r"]])
# update the internal state
self.mask &= mask
[docs] def selection(self, data):
"""
DEEP2 selection function
"""
self.photometryCut(data)
self.speczSuccess(data)
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the DEEP2 selection."
return printMsg
[docs]class SpecSelection_VVDSf02(SpecSelection):
"""
The class of spectroscopic selections with VVDSf02.
It covers an area of 0.5 deg^2 with ~10000 sources.
Necessary columns are i band magnitude and redshift.
"""
name = "specselection_VVDSf02"
[docs] def photometryCut(self, data):
"""
Photometric cut of VVDS 2h-field based on LeFèvre+05.
Notes
-----
The oversight of 1.0 magnitudes on the bright end misses 0.2% of galaxies.
"""
mask = (data[self.config.colnames["i"]] > 17.5) & (
data[self.config.colnames["i"]] < 24.0
)
# 17.5, 24.0
self.mask &= mask
[docs] def speczSuccess(self, data):
"""
Success rate of VVDS 2h-field
Notes
-----
We use a redshift-based and I-band based success rate
independently here since we do not know their correlation,
which makes the success rate worse than in reality.
Spec-z success rate as function of i_AB read of Figure 16 in
LeFevre+05 for the VVDS 2h field. Values are binned in steps of
0.5 mag with the first starting at 17 and the last bin ending at 24.
"""
success_I_bins = np.arange(17.0, 24.0 + 0.01, 0.5)
success_I_centers = (success_I_bins[1:] + success_I_bins[:-1]) / 2.0
success_rate_dir = self.config.success_rate_dir
success_I_rate = np.loadtxt(
os.path.join(success_rate_dir, "VVDSf02_I_success.txt")
)
# interpolate the success rate as probability of being selected with
# the probability at I > 24 being 0
p_success_I = interp1d(
success_I_centers,
success_I_rate,
kind="quadratic",
bounds_error=False,
fill_value=(success_I_rate[0], 0.0),
)
# Randomly sample objects according to their success rate
random_draw = self.rng.random(len(data))
mask = random_draw < p_success_I(data["mag_i_lsst"])
# Spec-z success rate as function of redshift read of Figure 13a/b in
# LeFevre+13 for VVDS deep sample. The listing is split by i_AB into
# ranges (17.5; 22.5] and (22.5; 24.0].
# NOTE: at z > 1.75 there are only lower limits (due to a lack of
# spec-z?), thus the success rate is extrapolated as 1.0 at z > 1.75
success_z_bright_centers, success_z_bright_rate = np.loadtxt(
os.path.join(success_rate_dir, "VVDSf02_z_bright_success.txt")
).T
success_z_deep_centers, success_z_deep_rate = np.loadtxt(
os.path.join(success_rate_dir, "VVDSf02_z_deep_success.txt")
).T
# interpolate the success rates as probability of being selected with
# the probability in the bright bin at z > 1.75 being 1.0 and the deep
# bin at z > 4.0 being 0.0
p_success_z_bright = interp1d(
success_z_bright_centers,
success_z_bright_rate,
kind="quadratic",
bounds_error=False,
fill_value=(success_z_bright_rate[0], 1.0),
)
p_success_z_deep = interp1d(
success_z_deep_centers,
success_z_deep_rate,
kind="quadratic",
bounds_error=False,
fill_value=(success_z_deep_rate[0], 0.0),
)
# Randomly sample objects according to their success rate
random_draw = self.rng.random(len(data))
iterator = zip(
[
data[self.config.colnames["i"]] <= 22.5,
data[self.config.colnames["i"]] > 22.5,
],
[p_success_z_bright, p_success_z_deep],
)
for m, p_success_z in iterator:
mask[m] &= random_draw[m] < p_success_z(
data[self.config.colnames["redshift"]][m]
)
# update the internal state
self.mask &= mask
[docs] def selection(self, data):
self.photometryCut(data)
self.speczSuccess(data)
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the VVDSf02 selection."
return printMsg
[docs]class SpecSelection_zCOSMOS(SpecSelection):
"""
The class of spectroscopic selections with zCOSMOS
It covers an area of 1.7 deg^2 with ~20000 galaxies.
For zCOSMOS, the data should at least include i band and redshift.
"""
name = "specselection_zCOSMOS"
[docs] def photometryCut(self, data):
"""
Photometry cut for zCOSMOS based on Lilly+09.
NOTE: This only includes zCOSMOS bright.
update the internal state
"""
mask = (data[self.config.colnames["i"]] > 15.0) & (
data[self.config.colnames["i"]] < 22.5
)
# 15.0, 22.5
self.mask &= mask
[docs] def speczSuccess(self, data):
"""
Spec-z success rate as function of redshift (x) and I_AB (y) read of
Figure 3 in Lilly+09 for zCOSMOS bright sample.
"""
success_rate_dir = self.config.success_rate_dir
x = np.arange(0, 1.4, 0.00587002, dtype=np.float64)
y = np.arange(18, 22.4, 0.01464226, dtype=np.float64)
pixels_y = np.searchsorted(y, data[self.config.colnames["i"]])
pixels_x = np.searchsorted(x, data[self.config.colnames["redshift"]])
rates = np.loadtxt(os.path.join(success_rate_dir, "zCOSMOS_success.txt"))
ratio_list = np.zeros(len(pixels_y))
for i, py in enumerate(pixels_y):
if (
(py >= rates.shape[0])
or (pixels_x[i] >= rates.shape[1])
or (py == 0)
or (pixels_x[i] == 0)
):
ratio_list[i] = 0
else:
ratio_list[i] = rates[pixels_y[i] - 1][pixels_x[i] - 1]
randoms = self.rng.uniform(size=data[self.config.colnames["i"]].size)
mask = randoms <= ratio_list
self.mask &= mask
[docs] def selection(self, data):
self.photometryCut(data)
self.speczSuccess(data)
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the zCOSMOS selection."
return printMsg
[docs]class SpecSelection_HSC(SpecSelection):
"""
The class of spectroscopic selections with HSC
or HSC, the data should at least include giz bands and redshift.
"""
name = "specselection_HSC"
[docs] def photometryCut(self, data):
"""
HSC galaxies were binned in color magnitude space with i-band mag from -2 to 6 and g-z color from 13 to 26.
"""
mask = (data[self.config.colnames["i"]] > 13.0) & (
data[self.config.colnames["i"]] < 26.0
)
self.mask &= mask
gz = data[self.config.colnames["g"]] - data[self.config.colnames["z"]]
mask = (gz > -2.0) & (gz < 6.0)
self.mask &= mask
[docs] def speczSuccess(self, data):
"""
HSC galaxies were binned in color magnitude space with i-band mag from -2 to 6 and g-z color from 13 to 26
200 bins in each direction. The ratio of of galaxies with spectroscopic redshifts (training galaxies) to
galaxies with only photometry in HSC wide field (application galaxies) was computed for each pixel. We divide
the data into the same pixels and randomly select galaxies into the training sample based on the HSC ratios
"""
success_rate_dir = self.config.success_rate_dir
x_edge = np.linspace(13, 26, 201, endpoint=True)
y_edge = np.linspace(-2, 6, 201, endpoint=True)
rates = np.loadtxt(os.path.join(success_rate_dir, "hsc_success.txt"))
pixels_y = np.searchsorted(
y_edge, data[self.config.colnames["g"]] - data[self.config.colnames["z"]]
)
pixels_x = np.searchsorted(x_edge, data[self.config.colnames["i"]])
# Do the color-based, percentile-based redshift cut
percentile_cut = self.config.percentile_cut
mask_keep = np.ones_like(data[self.config.colnames["i"]])
if percentile_cut != 100: # pragma: no cover
pixels_y_unique = np.unique(pixels_y)
pixels_x_unique = np.unique(pixels_x)
for y in pixels_y_unique:
for x in pixels_x_unique:
ind_inpix = np.where((pixels_y == y) * (pixels_x == x))[0]
if ind_inpix.size == 0:
continue
redshifts = data[self.config.colnames["redshift"]][ind_inpix]
percentile = np.percentile(redshifts, percentile_cut)
ind_remove = ind_inpix[redshifts > percentile]
mask_keep[ind_remove] = 0
self.mask &= mask_keep
pixels_y = pixels_y - 1
pixels_x = pixels_x - 1
ratio_list = np.zeros(len(pixels_y))
for i, py in enumerate(pixels_y):
if (py >= rates.shape[0]) or (pixels_x[i] >= rates.shape[1]):
ratio_list[i] = 0
else:
ratio_list[i] = rates[pixels_y[i]][pixels_x[i]]
randoms = self.rng.uniform(size=data[self.config.colnames["i"]].size)
mask = randoms <= ratio_list
self.mask &= mask
[docs] def selection(self, data):
self.photometryCut(data)
self.speczSuccess(data)
def __repr__(self):
"""
Define how the model is represented and printed.
"""
# start message
printMsg = "Applying the HSC selection."
return printMsg