# Source code for o3api.prepare

# -*- coding: utf-8 -*-
# Copyright (c) 2017 - 2020 Karlsruhe Institute of Technology - Steinbuch Centre for Computing
# This code is distributed under the MIT License
# Please, see the LICENSE file
# @author: vykozlov

import logging
import numpy as np
import o3api.config as cfg
import pandas as pd

# Module-level logger.
# NOTE(review): the name passed here is the quoted literal '__name__', not the
# module's __name__ variable, so this returns the logger registered under that
# literal name -- confirm this is intentional before changing it.
logger = logging.getLogger('__name__') #o3api

# configuration for netCDF: names looked up in cfg.netCDF_conf
TIME = cfg.netCDF_conf['t_c']  # key used below for ds.indexes[...] (time index)
LAT = cfg.netCDF_conf['lat_c']  # key used below for ds.coords[...] (latitude)
TCO3 = cfg.netCDF_conf['tco3']  # data variable selected in get_raw_data_pd()
VMRO3 = cfg.netCDF_conf['vmro3']  # not referenced in the visible part of this module
TCO3Return = cfg.netCDF_conf['tco3_r']  # not referenced in the visible part of this module

# configuration for API: its values are used below as the keyword-argument
# names read by PrepareData.__init__()
api_c = cfg.api_conf

class PrepareData:
    """Class to perform data selection, based on :class:`Dataset`.

    The selection parameters are passed to the constructor as keyword
    arguments; their names are taken from the API configuration ``api_c``
    (keys ``plot_t``, ``begin``, ``end``, ``month``, ``lat_min``, ``lat_max``).

    :param data: Container of datasets, indexed by model name
    :param plot_type: Name of the data variable read in :meth:`to_pd_dataframe`
    :param begin: Year to start data scanning from
    :param end: Year to finish data scanning
    :param month: Month(s) to select, if not a whole year
    :param lat_min: Minimum latitude to define the range (-90..90)
    :param lat_max: Maximum latitude to define the range (-90..90)
    """

    def __init__(self, data, **kwargs):
        """Constructor method

        :param data: Container of datasets, indexed by model name
        :param kwargs: Selection parameters, looked up under the names
                       configured in ``api_c``
        :raises KeyError: if one of the expected keyword arguments is missing
        """
        # NOTE(review): the attribute name was restored from a garbled
        # source line ("= data") -- confirm it matches the rest of the project.
        self.data = data
        self.plot_type = kwargs[api_c['plot_t']]
        self.begin = kwargs[api_c['begin']]
        self.end = kwargs[api_c['end']]
        self.month = kwargs[api_c['month']]
        self.lat_min = kwargs[api_c['lat_min']]
        self.lat_max = kwargs[api_c['lat_max']]

    def __check_latitude_order(self, ds):
        """Return the requested latitude bounds in the order used by ``ds``

        :param ds: xarray dataset to check
        :return: lat_a, lat_b: ``(lat_min, lat_max)`` if the first latitude
                 value of ``ds`` is smaller than the last one,
                 otherwise ``(lat_max, lat_min)``
        """
        # latitude may be stored as (-90..90) or (90..-90): compare the first
        # and the last coordinate value to find out which one is used
        lat_values = ds.coords[LAT].values
        lat_first = np.amin(lat_values[0])
        lat_last = np.amax(lat_values[-1])

        if lat_first < lat_last:
            return self.lat_min, self.lat_max
        return self.lat_max, self.lat_min

    def get_dataslice(self, model):
        """Select the slice of data according to the time and latitude requested

        :param model: The model to process
        :return: xarray dataset selected according to the time and latitude
        :rtype: xarray
        """
        ds = self.data[model]
        logger.debug("%s: Dataset is loaded from the storage location", model)

        # latitude bounds, ordered the same way as the dataset stores them
        lat_a, lat_b = self.__check_latitude_order(ds)

        # BUG(?) ccmi-umukca-ucam complains about 31-12-year, but 30-12-year works
        # CFTime360day date format has 30 days for every month???
        # {}-01-01T00:00:00 .. {}-12-30T23:59:59
        # Therefore the period is given below with month resolution only.
        if self.month is not None and len(self.month) > 0:
            if all(m in range(1, 13) for m in self.month):
                # keep only the time steps that fall into the requested months
                ds = ds.sel({TIME: ds[TIME].dt.month.isin(self.month)})
            else:
                logger.warning("Wrong month number! Using whole year range. "
                               "Check values: %s.", self.month)

        # the coordinate names come from the netCDF configuration (TIME, LAT),
        # as everywhere else in this class
        # (open question from the original author: skip the selection of
        # years here for performance?)
        ds_slice = ds.sel({TIME: slice(f"{self.begin}-01", f"{self.end}-12"),
                           LAT: slice(lat_a, lat_b)})
        return ds_slice

    def to_pd_dataframe(self, ds, model) -> pd.DataFrame:
        """Convert xarray variable to pandas dataframe (faster method?)

        :param ds: xarray dataset
        :param model: The model to process for self.plot_type
        :return: dataset as pandas dataframe, indexed by 'time', with a
                 single column named after ``model``
        :rtype: pandas dataframe
        """
        # convert to pandas series to keep date information
        # different time axes should be harmonized in o3skim..
        time_index = ds.indexes[TIME]
        if isinstance(time_index, pd.DatetimeIndex):
            time_axis = time_index.values
        else:
            # convert CFTimeIndex to pd.DatetimeIndex
            # NOTE(review): an earlier comment mentioned unsafe=True to turn
            # the conversion warnings off, but no argument is passed here --
            # confirm which behaviour is wanted.
            time_axis = time_index.to_datetimeindex()

        # nan_to_num() turns NaN into 0 and replace() turns every 0 into NaN,
        # i.e. values that are exactly 0 are treated as missing as well
        pd_model = pd.DataFrame({
            model: np.nan_to_num(ds[self.plot_type]),
            'time': time_axis,
        }).replace({0: np.nan})

        # set index to 'time', also important for performance pd.join() (?)
        return pd_model.set_index('time')

    def get_raw_data_pd(self, model) -> pd.DataFrame:
        """Process the model to get tco3_zm raw data

        :param model: The model to process for tco3_zm
        :return: raw data points in preparation for plotting
        :rtype: pd.DataFrame
        """
        # data selection according to time and latitude
        ds_slice = self.get_dataslice(model)
        # average the TCO3 variable over the selected latitudes
        ds_plot_type = ds_slice[[TCO3]].mean(dim=[LAT])
        # lazy %-formatting: the dataset repr is built only if DEBUG is enabled
        logger.debug("ds_plot_type: %s", ds_plot_type)

        return self.to_pd_dataframe(ds_plot_type, model)

    def get_raw_ensemble_pd(self, models) -> pd.DataFrame:
        """Build the ensemble of tco3_zm models

        :param models: Models to process for tco3_zm
        :return: ensemble of models as pd.DataFrame, one column per model,
                 sorted by the time index
        :rtype: pd.DataFrame
        :raises IndexError: if ``models`` is empty
        """
        data = self.get_raw_data_pd(models[0])  # initialize with first model
        if len(models) > 1:
            # DataFrame.join() is documented to take a list of frames, so the
            # remaining models are collected in a list first.
            # how="outer" is important in order to keep all indices/dates
            others = [self.get_raw_data_pd(m) for m in models[1:]]
            data = data.join(others, how="outer")

        return data.sort_index()