Source code for o3api.prepare

# -*- coding: utf-8 -*-
#
# Copyright (c) 2017 - 2020 Karlsruhe Institute of Technology - Steinbuch Centre for Computing
# This code is distributed under the MIT License
# Please, see the LICENSE file
#
# @author: vykozlov

import logging
import numpy as np
import o3api.config as cfg
import pandas as pd

logger = logging.getLogger('__name__') #o3api
logger.setLevel(cfg.log_level)

# configuration for netCDF
TIME = cfg.netCDF_conf['t_c']
LAT = cfg.netCDF_conf['lat_c']
TCO3 = cfg.netCDF_conf['tco3']
VMRO3 = cfg.netCDF_conf['vmro3']
TCO3Return = cfg.netCDF_conf['tco3_r']

# configuration for API
api_c = cfg.api_conf


[docs]class PrepareData():
    """Class to perform data selection, based on :class:`Dataset`.

    :param begin: Year to start data scanning from
    :param end: Year to finish data scanning
    :param month: Month(s) to select, if not a whole year
    :param lat_min: Minimum latitude to define the range (-90..90)
    :param lat_max: Maximum latitude to define the range (-90..90)
    """

    def __init__ (self, data, **kwargs):
        """Constructor method
        """
        self.data = data
        self.plot_type = kwargs[api_c['plot_t']]
        self.begin = kwargs[api_c['begin']]
        self.end = kwargs[api_c['end']]
        self.month = kwargs[api_c['month']]
        self.lat_min = kwargs[api_c['lat_min']]
        self.lat_max = kwargs[api_c['lat_max']]

    def __check_latitude_order(self, ds):
        """Function to check the latitude order, 
        returns them correctly ordered

        :param ds: xarray dataset to check
        :return: lat_0, lat_last
        """
        # check in what order latitude is used, e.g. (-90..90) or (90..-90)
        lat_0 = np.amin(ds.coords[LAT].values[0]) # min latitude
        lat_last = np.amax(ds.coords[LAT].values[-1]) # max latitude

        if lat_0 < lat_last:
            lat_a = self.lat_min
            lat_b = self.lat_max
        else:
            lat_a = self.lat_max
            lat_b = self.lat_min

        return lat_a, lat_b
       
[docs]    def get_dataslice(self, model):
        """Function to select the slice of data according 
        to the time and latitude requested

        :param model: The model to process
        :return: xarray dataset selected according to the time and latitude
        :rtype: xarray
        """
        ds = self.data[model]
        logger.debug(F"{model}: Dataset is loaded from the storage location")
        
        # check in what order latitude is used, return them correspondently
        lat_a, lat_b = self.__check_latitude_order(ds)

        # select data according to the period and latitude
        # BUG(?) ccmi-umukca-ucam complains about 31-12-year, but 30-12-year works
        # CFTime360day date format has 30 days for every month???
        # {}-01-01T00:00:00 .. {}-12-30T23:59:59
        if len(self.month) > 0:
            if all(x in range(1,13) for x in self.month):
                ds = ds.sel(time=ds.time.dt.month.isin(self.month))
            else:
                logger.warning(F"Wrong month number! Using whole year range.\
                Check values: {self.month}.")

        ds_slice = ds.sel(time=slice(F"{self.begin}-01", 
                                     F"{self.end}-12"),
                          lat=slice(lat_a,
                                    lat_b))  # latitude
        #print("get_dataslice:", model, ds)
        # maybe skip years selection here? performance?
        #ds_slice = ds.sel(lat=slice(lat_a,
        #                            lat_b))  # latitude

        return ds_slice

[docs]    def to_pd_dataframe(self, ds, model) -> pd.DataFrame:
        """Convert xarray variable to pandas dataframe (faster method?)
        
        :param ds: xarray dataset
        :param model: The model to process for self.plot_type
        :return dataset as pandas dataframe
        :rtype: pandas dataframe
        """

        # convert to pandas series to keep date information
        # different time axes should be harmonized in o3skim..
        
        if (type(ds.indexes[TIME]) is 
            pd.core.indexes.datetimes.DatetimeIndex) :
            time_axis = ds.indexes[TIME].values
        else:
            # convert CFTimeIndex to pd.DatetimeIndex, turn Warnings Off (unsafe=True)
            time_axis = ds.indexes[TIME].to_datetimeindex()

        pd_model = pd.DataFrame({ model: np.nan_to_num(ds[self.plot_type]),
                                  'time': time_axis}).replace({0: np.nan})
        # set index to 'time', also important for performance pd.join() (?)
        pd_model = pd_model.set_index('time')

        return pd_model

[docs]    def get_raw_data_pd(self, model) -> pd.DataFrame:
        """Process the model to get tco3_zm raw data

        :param model: The model to process for tco3_zm
        :return: raw data points in preparation for plotting
        :rtype: pd.DataFrame
        """
        # data selection according to time and latitude
        ds_slice = self.get_dataslice(model)
        ds_plot_type = ds_slice[[TCO3]].mean(dim=[LAT])
        logger.debug("ds_plot_type: {}".format(ds_plot_type))

        data = self.to_pd_dataframe(ds_plot_type, model)
        return data

[docs]    def get_raw_ensemble_pd(self, models) -> pd.DataFrame:
        """Build the ensemble of tco3_zm models

        :param models: Models to process for tco3_zm
        :return: ensemble of models as pd.DataFrame
        :rtype: pd.DataFrame
        """

        data = self.get_raw_data_pd(models[0]) # initialize with first model
        if len(models) > 1:
            ## PERFORMANCE? map() and join should be faster than 'for' and merge
            # how="outer" is important in order to keep all indecies/dates
            data_list = map(self.get_raw_data_pd, models[1:])
            data = data.join(data_list, how="outer")

            ## previous method uses merge
            #for m in models[1:]:
            #   data = data.merge(self.get_raw_data_pd(m), 
            #                     how='outer',
            #                     on=['time'])
            ##

        return data.sort_index()