Source code for qf_lib.common.utils.factorization.manager

#     Copyright 2016-present CERN – European Organization for Nuclear Research
#
#     Licensed under the Apache License, Version 2.0 (the "License");
#     you may not use this file except in compliance with the License.
#     You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#     Unless required by applicable law or agreed to in writing, software
#     distributed under the License is distributed on an "AS IS" BASIS,
#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#     See the License for the specific language governing permissions and
#     limitations under the License.

from typing import Tuple

from qf_lib.common.enums.frequency import Frequency
from qf_lib.common.utils.data_cleaner import DataCleaner
from qf_lib.common.utils.dateutils.get_values_common_dates import get_values_for_common_dates
from qf_lib.common.utils.factorization.data_models.data_model import DataModel
from qf_lib.common.utils.factorization.data_models.data_model_input import DataModelInput
from qf_lib.common.utils.factorization.data_models.rolling_data_model import RollingDataModel
from qf_lib.common.utils.factorization.factors_identification.factors_identifier import FactorsIdentifier
from qf_lib.common.utils.logging.qf_parent_logger import qf_logger
from qf_lib.containers.dataframe.qf_dataframe import QFDataFrame
from qf_lib.containers.series.qf_series import QFSeries
from qf_lib.containers.series.simple_returns_series import SimpleReturnsSeries


[docs]class FactorizationManager: """ Facade class for factorization. Parameters ---------- analysed_tms must have a set name in order to be displayed properly later on regressors_df must have a set name for each column in order to be displayed properly later on frequency frequency of every series (the same for all) factors_identifier class used for identifying significant factors for the model (picks them up from regressors_df) is_fit_intercept default True; True if the calculated model should include the intercept coefficient """ def __init__(self, analysed_tms: QFSeries, regressors_df: QFDataFrame, frequency: Frequency, factors_identifier: FactorsIdentifier, is_fit_intercept: bool = True): self.logger = qf_logger.getChild(self.__class__.__name__) self.analysed_tms = analysed_tms.to_simple_returns() self.regressors_df = regressors_df.to_simple_returns() self.frequency = frequency self.factors_identifier = factors_identifier self.is_fit_intercept = is_fit_intercept self.used_regressors_ = None # data frame of regressors used in the model self.used_fund_returns_ = None # analysed timeseries without dates unused in the regression self.coefficients_vector_ = None # vector of coefficients for each regressor used in the model self.intercept_ = None # the independent term in a linear model
[docs] def extract_data_for_analysis(self) -> Tuple[QFDataFrame, QFSeries]: """ Extracts data which is useful for building the model explaining the fund's timeseries. Returns ------- Tuple[QFDataFrame, QFSeries] Dataframe containing only those regressors which are useful for modeling fund's timeseries and a Timeseries of fund which is preprocessed (cleaned data) """ common_regressors_df, common_analysed_tms = self._preprocess_data(self.analysed_tms, self.regressors_df) selected_regressors_df = \ self.factors_identifier.select_best_factors(common_regressors_df, common_analysed_tms) self.used_regressors_ = selected_regressors_df self.used_fund_returns_ = common_analysed_tms return selected_regressors_df, common_analysed_tms
[docs] def get_factorization_data_model(self) -> DataModel: """ Creates model explaining fund's timeseries. """ model_input = DataModelInput(self.used_regressors_, self.used_fund_returns_, self.frequency, self.is_fit_intercept) data_model = DataModel(model_input) data_model.setup() return data_model
[docs] def get_rolling_factorization_data_model(self) -> RollingDataModel: """ Creates multiple models explaining fund's timeseries (one model for each time window). """ model_input = DataModelInput(self.used_regressors_, self.used_fund_returns_, self.frequency, self.is_fit_intercept) data_model = RollingDataModel(model_input) data_model.setup() return data_model
def _preprocess_data(self, analysed_tms, regressors_df): """ Cleans the data before they are processed (e.g. removes regressors containing too many missing data, proxies missing data). """ self.logger.debug("Length of input timeseries: {:d} \n".format(len(analysed_tms))) data_cleaner = DataCleaner(regressors_df) common_regressors_df = data_cleaner.proxy_using_regression(analysed_tms, columns_type=SimpleReturnsSeries) common_regressors_df, common_analysed_tms = get_values_for_common_dates(common_regressors_df, analysed_tms) self.logger.debug("Length of preprocessed timeseries: {:d}".format(common_analysed_tms.size)) self.logger.debug("Number of regressors: {:d}".format(common_regressors_df.shape[1])) return common_regressors_df, common_analysed_tms