相关的辅助函数

In [1]:
import pandas as pd
import os
import tempfile
import zipfile
import glob
from tqdm import tqdm
import math
import requests

#colour==0.1.5
#cvxpy==1.0.3
#cycler==0.10.0
#numpy==1.13.3
#pandas==0.21.1
#plotly==2.2.3
#pyparsing==2.2.0
#python-dateutil==2.6.1
#pytz==2017.3
#requests==2.18.4
#scipy==1.0.0
#scikit-learn==0.19.1
#six==1.11.0
#tqdm==4.19.5


# Shared colour palette: the plotting helpers in this notebook look colours up
# here by role (e.g. 'major_line', 'minor_line', 'df_header') rather than
# hard-coding hex values at each call site.
color_scheme = {
    'index': '#B6B2CF',
    'etf': '#2D3ECF',
    'tracking_error': '#6F91DE',
    'df_header': 'silver',
    'df_value': 'white',
    'df_line': 'silver',
    'heatmap_colorscale': [(0, '#6F91DE'), (0.5, 'grey'), (1, 'red')],
    'background_label': '#9dbdd5',
    'low_value': '#B6B2CF',
    'high_value': '#2D3ECF',
    'y_axis_2_text_color': 'grey',
    'shadow': 'rgba(0, 0, 0, 0.75)',
    'major_line': '#2D3ECF',
    'minor_line': '#B6B2CF',
    'main_line': 'black'}


def download_quandl_dataset(quandl_api_key, database, dataset, save_path, columns, tickers, start_date, end_date):
    """
    Download a dataset from Quandl and save it to `save_path`.
    Filter by columns, tickers, and date
    :param quandl_api_key: The Quandl API key
    :param database: The Quandl database to download from
    :param dataset: The dataset to download
    :param save_path: The path to save the dataset
    :param columns: The columns to save
    :param tickers: The tickers to save
    :param start_date: First date to keep (inclusive); earlier rows are dropped
    :param end_date: Last date to keep (inclusive); later rows are dropped
    :raises requests.HTTPError: If either Quandl request returns an error status
    :raises AssertionError: If the downloaded archive does not contain exactly one csv file
    """
    scrape_url = 'https://www.quandl.com/api/v3/datatables/{}/{}?qopts.export=true&api_key={}'\
        .format(database, dataset, quandl_api_key)
    scrape_request = requests.get(scrape_url)
    # Fail early with a clear HTTP error instead of a confusing KeyError/JSON error below.
    scrape_request.raise_for_status()
    bulk_download_url = scrape_request.json()['datatable_bulk_download']['file']['link']

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Join with a separator so the archive lives INSIDE the temporary directory
        # and is removed together with it when the `with` block exits.
        tmp_wiki_file = os.path.join(tmp_dir, 'tmp.zip')

        bulk_download_request = requests.get(bulk_download_url, stream=True, cookies=scrape_request.cookies)
        bulk_download_request.raise_for_status()
        total_size = int(bulk_download_request.headers.get('content-length', 0))
        block_size = 1024 * 1024
        with open(tmp_wiki_file, 'wb') as f:
            for data in tqdm(
                    bulk_download_request.iter_content(block_size),
                    # True division so a trailing partial block is counted by ceil().
                    total=math.ceil(total_size / block_size),
                    unit='MB',
                    unit_scale=True,
                    desc='Downloading Data'):
                f.write(data)

        with tqdm(total=5, desc='Transforming Data', unit='Action') as pbar:
            # Unzip downloaded data (context manager closes the archive even on error)
            with zipfile.ZipFile(tmp_wiki_file, 'r') as zip_ref:
                zip_ref.extractall(tmp_dir)
            pbar.update(1)

            # Check if the zip file only contains one csv file
            #   We're assuming that Quandl will always give us the data in a single csv file.
            #   If it's different, we want to throw an error.
            csv_files = glob.glob(os.path.join(tmp_dir, '*.csv'))
            assert len(csv_files) == 1,\
                'Bulk download of Quandl Wiki data failed. Wrong number of csv files found. Found {} file(s).'\
                    .format(len(csv_files))
            tmp_csv_file = csv_files[0]

            tmp_df = pd.read_csv(tmp_csv_file)
            pbar.update(1)
            tmp_df['date'] = pd.to_datetime(tmp_df['date'])
            pbar.update(1)

            # Remove unused data and save
            tmp_df = tmp_df[tmp_df['date'].isin(pd.date_range(start_date, end_date))]  # Filter unused dates
            tmp_df = tmp_df[tmp_df['ticker'].isin(tickers)]  # Filter unused tickers
            pbar.update(1)
            tmp_df.to_csv(save_path, columns=columns, index=False)  # Filter unused columns and save
            pbar.update(1)


def generate_config():
    """Return the display options passed as `config` to every Plotly figure in this notebook."""
    config = dict(
        showLink=False,
        displayModeBar=False,
        showAxisRangeEntryBoxes=True)
    return config

利用plotly做图

In [2]:
import plotly.graph_objs as go
import plotly.offline as offline_py
offline_py.init_notebook_mode(connected=True)


def _generate_stock_trace(prices):
    """Build the Scatter trace (named 'Index') for a price series, drawn in the major line colour."""
    line_style = {'color': color_scheme['major_line']}
    return go.Scatter(name='Index', x=prices.index, y=prices, line=line_style)


def _generate_traces(name_df_color_data):
    """
    Build one line trace per (name, series, colour) entry.

    Parameters
    ----------
    name_df_color_data : iterable of (str, Series, str)
        Legend name, the series to plot against its own index, and the line colour

    Returns
    -------
    traces : list of go.Scatter
        One trace per input entry, in input order
    """
    # Plotly's scatter `mode` flag is 'lines' (plural); 'line' is not a valid value.
    return [
        go.Scatter(name=name, x=df.index, y=df, mode='lines', line={'color': color})
        for name, df, color in name_df_color_data]


def print_dataframe(df, n_rows=10, n_columns=3):
    """
    Display the top-left corner of a numeric DataFrame as a Plotly table.

    Values are formatted with three decimals. When `df` has more columns or rows
    than shown, a '...' column and/or '...' row is appended as a truncation marker.

    Parameters
    ----------
    df : DataFrame
        Numeric data to display
    n_rows : int
        Maximum number of rows to show
    n_columns : int
        Maximum number of columns to show
    """
    missing_val_str = '...'
    config = generate_config()

    formatted_df = df.iloc[:n_rows, :n_columns]
    # applymap returns a new frame, so the marker row/column added below never touches `df`.
    formatted_df = formatted_df.applymap('{:.3f}'.format)

    if len(df.columns) > n_columns:
        formatted_df[missing_val_str] = [missing_val_str]*len(formatted_df.index)
    if len(df.index) > n_rows:
        formatted_df.loc[missing_val_str] = [missing_val_str]*len(formatted_df.columns)

    trace = go.Table(
        type='table',
        columnwidth=[1, 3],
        header={
            'values': [''] + list(formatted_df.columns.values),
            # Use the notebook-level palette; there is no `helper` module in scope here.
            'line': {'color': color_scheme['df_line']},
            'fill': {'color': color_scheme['df_header']},
            'font': {'size': 13}},
        cells={
            'values': formatted_df.reset_index().values.T,
            'line': {'color': color_scheme['df_line']},
            'fill': {'color': [color_scheme['df_header'], color_scheme['df_value']]},
            'font': {'size': 13}})

    offline_py.iplot([trace], config=config)


def plot_stock(prices, title):
    """Render an interactive line chart of a single price series under `title`."""
    figure = {
        'data': [_generate_stock_trace(prices)],
        'layout': go.Layout(title=title)}
    offline_py.iplot(figure, config=generate_config())



def plot_resampled_prices(df_resampled, df, title):
    """Overlay the resampled ('Monthly Close') series on the original ('Close') series."""
    series_specs = [
        ('Monthly Close', df_resampled, color_scheme['major_line']),
        ('Close', df, color_scheme['minor_line'])]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title)}
    offline_py.iplot(figure, config=generate_config())


def plot_returns(returns, title):
    """Render a single 'Returns' line chart for the given series under `title`."""
    series_specs = [('Returns', returns, color_scheme['major_line'])]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title)}
    offline_py.iplot(figure, config=generate_config())


def plot_shifted_returns(df_shited, df, title):
    """Overlay the shifted returns ('Shifted Returns') on the unshifted ones ('Returns')."""
    series_specs = [
        ('Shifted Returns', df_shited, color_scheme['major_line']),
        ('Returns', df, color_scheme['minor_line'])]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title)}
    offline_py.iplot(figure, config=generate_config())


def print_top(df, name, top_n=10):
    """Print the `top_n` column labels of `df` with the largest column sums, highest first."""
    header = '{} Most {}:'.format(top_n, name)
    print(header)

    column_totals = df.sum()
    ranked_labels = column_totals.sort_values(ascending=False).index
    leaders = ranked_labels[:top_n].values.tolist()
    print(', '.join(leaders))

加载相关的工具包

In [3]:
import pandas as pd
import numpy as np
import project_tests

市场数据

In [4]:
# Load the end-of-day quotes in long format, parsing the 'date' column as datetimes.
df = pd.read_csv('eod-quotemedia.csv', parse_dates=['date'], index_col=False)
In [5]:
df.head(10)
Out[5]:
Unnamed: 0 date ticker adj_close
0 0 2013-07-01 A 29.99418563
1 1 2013-07-02 A 29.65013670
2 2 2013-07-03 A 29.70518453
3 3 2013-07-05 A 30.43456826
4 4 2013-07-08 A 30.52402098
5 5 2013-07-09 A 30.68916447
6 6 2013-07-10 A 31.17771395
7 7 2013-07-11 A 31.45983407
8 8 2013-07-12 A 31.48047700
9 9 2013-07-15 A 31.72819223
In [6]:
# Reshape the long-format quotes into a date x ticker matrix of adjusted close prices.
close = df.reset_index().pivot(index='date', columns='ticker', values='adj_close')
In [7]:
close.head(10)
Out[7]:
ticker A AAL AAP AAPL ABBV ABC ABT ACN ADBE ADI ... XL XLNX XOM XRAY XRX XYL YUM ZBH ZION ZTS
date
2013-07-01 29.99418563 16.17609308 81.13821681 53.10917319 34.92447839 50.86319750 31.42538772 64.69409505 46.23500000 39.91336014 ... 27.66879066 35.28892781 76.32080247 40.02387348 22.10666494 25.75338607 45.48038323 71.89882693 27.85858718 29.44789315
2013-07-02 29.65013670 15.81983388 80.72207258 54.31224742 35.42807578 50.69676639 31.27288084 64.71204071 46.03000000 39.86057632 ... 27.54228410 35.05903252 76.60816761 39.96552964 22.08273998 25.61367511 45.40266113 72.93417195 28.03893238 28.57244125
2013-07-03 29.70518453 16.12794994 81.23729877 54.61204262 35.44486235 50.93716689 30.72565028 65.21451912 46.42000000 40.18607651 ... 27.33445191 35.28008569 76.65042719 40.00442554 22.20236479 25.73475794 46.06329899 72.30145844 28.18131017 28.16838652
2013-07-05 30.43456826 16.21460758 81.82188233 54.17338125 35.85613355 51.37173702 31.32670680 66.07591068 47.00000000 40.65233352 ... 27.69589920 35.80177117 77.39419581 40.67537968 22.58516418 26.06075017 46.41304845 73.16424628 29.39626730 29.02459772
2013-07-08 30.52402098 16.31089385 82.95141667 53.86579916 36.66188936 52.03746147 31.76628544 66.82065546 46.62500000 40.25645492 ... 27.98505704 35.20050655 77.96892611 40.64620776 22.48946433 26.22840332 46.95062632 73.89282298 29.57661249 29.76536472
2013-07-09 30.68916447 16.71529618 82.43619048 54.81320389 36.35973093 51.69535307 31.16522893 66.48866080 47.26000000 40.69632003 ... 28.31939579 35.50113886 78.89018496 40.80179133 22.48946433 26.58233774 47.28094525 73.70108798 28.91218282 29.80384612
2013-07-10 31.17771395 16.53235227 81.99032166 54.60295791 36.85493502 52.28710814 31.16522893 66.71298151 47.25000000 41.10979324 ... 27.95794850 36.39419366 78.45068533 40.71427558 22.96796358 26.98284247 47.08340158 74.00785631 28.32368796 29.86156823
2013-07-11 31.45983407 16.72492481 82.00022986 55.45406479 37.08155384 53.72026495 31.85599537 67.47567196 47.99000000 42.22705062 ... 28.50011944 37.00430040 78.83102155 41.01571874 23.23113816 27.03872686 46.54333492 74.93774876 27.84909533 29.74612402
2013-07-12 31.48047700 16.90786872 81.91105609 55.35309481 38.15724076 53.98840397 31.81096287 67.76280247 48.39000000 42.53495620 ... 28.92482002 38.00346072 78.94089646 40.83096325 23.49431274 27.08529718 45.96422730 75.68549560 28.44708204 30.15979909
2013-07-15 31.72819223 17.10044125 82.61453801 55.47379158 37.79303181 53.84971137 31.95506689 68.41781897 48.12000000 42.57894271 ... 29.27723113 38.17146113 78.81411772 40.84068723 23.54216266 27.06666905 46.69299195 76.27027369 28.77929688 30.38106716

10 rows × 495 columns

股票样例

In [8]:
apple_ticker = 'AAPL'
plot_stock(close[apple_ticker], '{} Stock'.format(apple_ticker))

按月份重新抽样收盘价格

本项目使用月收盘价格每月交易一次,所以需要按照月重新抽样每月的收盘价

In [9]:
def resample_prices(close_prices, freq='M'):
    """
    Pick the last available close price of every resampling period.

    Parameters
    ----------
    close_prices : DataFrame
        Close prices for each ticker and date
    freq : str
        Resampling frequency (pandas offset alias), see
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases

    Returns
    -------
    prices_resampled : DataFrame
        Last close price of each period for each ticker
    """
    period_groups = close_prices.resample(freq)
    prices_resampled = period_groups.last()
    return prices_resampled

project_tests.test_resample_prices(resample_prices)
Tests Passed
In [10]:
monthly_close = resample_prices(close)
plot_resampled_prices(
    monthly_close.loc[:, apple_ticker],
    close.loc[:, apple_ticker],
    '{} Stock - Close Vs Monthly Close'.format(apple_ticker))

计算股票的对数收益

log return $= R = log_e(\frac{P_t}{P_{t-1}}) = log_e(P_t) - log_e(P_{t-1})$

raw return $= r = \frac{{P_t}-P_{t-1}}{P_{t-1}}$

$R = log_e(r+1)$

$r = e^R -1$

根据连续复利的推导公式可知,股票的对数收益 R 实际上就是连续复利下的回报率

In [11]:
def compute_log_returns(prices):
    """
    Compute period-over-period log returns, log(P_t) - log(P_{t-1}), for each ticker.

    Parameters
    ----------
    prices : DataFrame
        Prices for each ticker and date

    Returns
    -------
    log_returns : DataFrame
        Log returns for each ticker and date; the first row is NaN because it has no predecessor
    """
    log_prices = np.log(prices)
    previous_log_prices = log_prices.shift(1)
    return log_prices - previous_log_prices

project_tests.test_compute_log_returns(compute_log_returns)
Tests Passed
In [12]:
monthly_close_returns = compute_log_returns(monthly_close)
plot_returns(
    monthly_close_returns.loc[:, apple_ticker],
    'Log Returns of {} Stock (Monthly)'.format(apple_ticker))

收益率偏移(Shift Returns)

实现函数shift_returns在时间序列上移动收益到以前或者未来的时间,例如当shift_n为2时,收益数据像下面这样子:

                           Returns
               A         B         C         D
2013-07-08     0.015     0.082     0.096     0.020     ...
2013-07-09     0.037     0.095     0.027     0.063     ...
2013-07-10     0.094     0.001     0.093     0.019     ...
2013-07-11     0.092     0.057     0.069     0.087     ...
...            ...       ...       ...       ...

the output of the shift_returns function would be:

                        Shift Returns
               A         B         C         D
2013-07-08     NaN       NaN       NaN       NaN       ...
2013-07-09     NaN       NaN       NaN       NaN       ...
2013-07-10     0.015     0.082     0.096     0.020     ...
2013-07-11     0.037     0.095     0.027     0.063     ...
...            ...       ...       ...       ...

Using the same returns data as above, the shift_returns function should generate the following with shift_n as -2:

                        Shift Returns
               A         B         C         D
2013-07-08     0.094     0.001     0.093     0.019     ...
2013-07-09     0.092     0.057     0.069     0.087     ...
...            ...       ...       ...       ...       ...
...            ...       ...       ...       ...       ...
...            NaN       NaN       NaN       NaN       ...
...            NaN       NaN       NaN       NaN       ...

Note: The "..." represents data points we're not showing.

In [13]:
def shift_returns(returns, shift_n):
    """
    Move returns forward or backward along the date index.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    shift_n : int
        Number of periods to move; positive moves values to later dates,
        negative moves them to earlier dates

    Returns
    -------
    shifted_returns : DataFrame
        Shifted returns for each ticker and date, NaN where no value was moved in
    """
    shifted_returns = returns.shift(periods=shift_n)
    return shifted_returns

project_tests.test_shift_returns(shift_returns)
Tests Passed

现在对比下当月收益和下月收益

In [14]:
# Returns of the previous month (each row holds the return from one period earlier)
prev_returns = shift_returns(monthly_close_returns, 1)
# Returns of the next month (each row holds the return from one period later)
lookahead_returns = shift_returns(monthly_close_returns, -1)

plot_shifted_returns(
    prev_returns.loc[:, apple_ticker],
    monthly_close_returns.loc[:, apple_ticker],
    'Previous Returns of {} Stock'.format(apple_ticker))

plot_shifted_returns(
    lookahead_returns.loc[:, apple_ticker],
    monthly_close_returns.loc[:, apple_ticker],
    'Lookahead Returns of {} Stock'.format(apple_ticker))

生成交易信号

交易信号是一系列交易动作,或者可用于指导交易的结果。一种常见的形式是在每个调仓日(例如每月月末,或者其他你想要的交易频率)生成由多头和空头头寸组成的投资组合。在这些调仓日可以根据交易信号重新平衡投资组合:买入建立多头头寸,卖出建立空头头寸。

本实验尝试下面的策略:

在观察期的每月末,前面月份的股票收益从高到低排序,选择表现最好的买入,卖出表现最差的

Implement the get_top_n function to get the top performing stock for each month. Get the top performing stocks from prev_returns by assigning them a value of 1. For all other stocks, give them a value of 0. For example, using the following prev_returns:

                                     Previous Returns
               A         B         C         D         E         F         G
2013-07-08     0.015     0.082     0.096     0.020     0.075     0.043     0.074
2013-07-09     0.037     0.095     0.027     0.063     0.024     0.086     0.025
...            ...       ...       ...       ...       ...       ...       ...

The function get_top_n with top_n set to 3 should return the following:

                                     Previous Returns
               A         B         C         D         E         F         G
2013-07-08     0         1         1         0         1         0         0
2013-07-09     0         1         0         1         0         1         0
...            ...       ...       ...       ...       ...       ...       ...

Note: You may have to use pandas' DataFrame.iterrows with Series.nlargest in order to implement the function. This is one of those cases where creating a vectorized solution is too difficult.

In [15]:
def get_top_n(prev_returns, top_n):
    """
    Select the top performing stocks
    
    Parameters
    ----------
    prev_returns : DataFrame
        Previous shifted returns for each ticker and date
    top_n : int
        The number of top performing stocks to get
    
    Returns
    -------
    top_stocks : DataFrame
        Top stocks for each ticker and date marked with a 1
    """
    # TODO: Implement Function
    top_stocks = prev_returns.copy()
    for i,row in prev_returns.iterrows():
        top_stocks.loc[i].loc[row.nlargest(top_n).index] = 1.0
        top_stocks.loc[i].loc[set(row.index) - set(row.nlargest(top_n).index)] = 0

    return top_stocks.astype(np.int)

project_tests.test_get_top_n(get_top_n)
Tests Passed
In [16]:
# Number of stocks to hold on each side (long and short) every month.
top_bottom_n = 50
df_long = get_top_n(prev_returns, top_bottom_n)
# Negating the returns turns "largest" into "smallest", i.e. the worst performers.
df_short = get_top_n(-1*prev_returns, top_bottom_n)
print_top(df_long, 'Longed Stocks',top_n=10)
print_top(df_short, 'Shorted Stocks',top_n=10)
10 Most Longed Stocks:
INCY, AMD, AVGO, NFX, SWKS, NFLX, ILMN, UAL, NVDA, MU
10 Most Shorted Stocks:
RRC, FCX, CHK, MRO, GPS, WYNN, DVN, FTI, SPLS, TRIP

预计收益

现在是时候检查交易信号是否有获利的潜力

首先计算组合的净利润。为了简单性,我们假设每只股票投资等量资金,这样每只股票收益的简单算术平均就是投资组合的收益

Implement the portfolio_returns function to compute the expected portfolio returns. Using df_long to indicate which stocks to long and df_short to indicate which stocks to short, calculate the returns using lookahead_returns. To help with calculation, we've provided you with n_stocks as the number of stocks we're investing in a single period.

In [17]:
def portfolio_returns(df_long, df_short, lookahead_returns, n_stocks):
    """
    Compute expected returns for the portfolio, assuming equal investment in each long/short stock.

    Parameters
    ----------
    df_long : DataFrame
        Top stocks for each ticker and date marked with a 1
    df_short : DataFrame
        Bottom stocks for each ticker and date marked with a 1
    lookahead_returns : DataFrame
        Lookahead returns for each ticker and date
    n_stocks: int
        The number of stocks chosen for each month

    Returns
    -------
    portfolio_returns : DataFrame
        Expected portfolio returns for each ticker and date
    """
    # Net position per ticker and date: +1 long, -1 short, 0 when the stock is in
    # neither list or in both (the long and short signals cancel out).
    net_positions = df_long - df_short
    # A short position earns the negated return, hence the signed multiplication.
    signed_returns = lookahead_returns * net_positions
    # Dividing by the stock count here means a plain row sum later yields the
    # equal-weighted portfolio return.
    return signed_returns / n_stocks

project_tests.test_portfolio_returns(portfolio_returns)
Tests Passed
In [18]:
expected_portfolio_returns = portfolio_returns(df_long, df_short, lookahead_returns, 2*top_bottom_n)
expected_portfolio_returns.head(10)
Out[18]:
ticker A AAL AAP AAPL ABBV ABC ABT ACN ADBE ADI ... XL XLNX XOM XRAY XRX XYL YUM ZBH ZION ZTS
date
2013-07-31 0.00000000 -0.00000000 -0.00000000 0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 ... -0.00000000 -0.00000000 -0.00000000 -0.00000000 0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000
2013-08-31 0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 0.00000000 ... 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000
2013-09-30 -0.00009607 -0.00147346 0.00000000 0.00092019 0.00000000 0.00000000 -0.00100586 0.00000000 0.00000000 0.00000000 ... -0.00000000 -0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 0.00000000
2013-10-31 0.00000000 0.00066471 0.00000000 -0.00067721 0.00000000 0.00000000 0.00000000 0.00000000 0.00046134 -0.00000000 ... 0.00000000 -0.00000000 -0.00048996 0.00000000 0.00000000 0.00001738 0.00000000 0.00000000 -0.00033812 -0.00000000
2013-11-30 0.00000000 0.00072677 0.00091973 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 ... -0.00000000 -0.00032986 0.00000000 0.00000000 -0.00071819 0.00001157 0.00027009 0.00000000 0.00000000 0.00000000
2013-12-31 0.00000000 0.00000000 0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 ... -0.00000000 0.00000000 -0.00000000 -0.00000000 -0.00114809 -0.00000000 -0.00113616 0.00000000 -0.00000000 -0.00000000
2014-01-31 -0.00000000 0.00000000 0.00103739 0.00000000 0.00033556 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 ... 0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 -0.00098217 -0.00000000 0.00000000 0.00000000
2014-02-28 -0.00000000 -0.00008976 -0.00000000 -0.00019756 0.00000000 -0.00000000 -0.00000000 -0.00000000 -0.00000000 0.00000000 ... -0.00032843 0.00000000 0.00000000 0.00000000 -0.00033529 -0.00000000 -0.00017529 0.00000000 -0.00000000 -0.00000000
2014-03-31 0.00031825 -0.00000000 -0.00000000 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 -0.00063586 -0.00000000 ... 0.00000000 -0.00139790 0.00000000 0.00031104 0.00000000 0.00031620 0.00000000 0.00000000 -0.00000000 0.00000000
2014-04-30 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 ... 0.00000000 0.00000000 -0.00000000 0.00000000 0.00000000 0.00004373 0.00000000 0.00000000 -0.00000000 -0.00014436

10 rows × 495 columns

In [19]:
expected_portfolio_returns.T.sum().head(10)
Out[19]:
date
2013-07-31    0.00000000
2013-08-31    0.00000000
2013-09-30   -0.00236456
2013-10-31   -0.00222744
2013-11-30   -0.00096828
2013-12-31    0.00264429
2014-01-31   -0.00486972
2014-02-28   -0.00761406
2014-03-31   -0.00893045
2014-04-30   -0.00725501
Freq: M, dtype: float64
In [20]:
plot_returns(expected_portfolio_returns.T.sum(), 'Portfolio Returns')

假设检验

T-Test

T检验,亦称Student t检验,主要用于样本含量较小(例如小于30)、总体标准差未知且总体服从正态分布的情形,是检验小样本均值差异程度的方法。它利用t分布理论来推断差异发生的概率,从而判断两个平均数的差异是否显著。

单个样本的T检验:目的是比较样本均值所代表的未知总体均值$\mu$和已知总体的均值$\mu_0$

配对样本T检验:将受试对象的某些重要特征按相近的原则配成对子,目的是消除混杂因素的影响,一对观察对象之间除了处理因素/研究因素之外,其它因素基本齐同,每对中的两个个体随机给予两种处理

具体步骤请参考智库百科

Our null hypothesis ($H_0$) is that the actual mean return from the signal is zero. We'll perform a one-sample, one-sided t-test on the observed mean return, to see if we can reject $H_0$.

We'll need to first compute the t-statistic, and then find its corresponding p-value. The p-value will indicate the probability of observing a mean return equally or more extreme than the one we observed if the null hypothesis were true. A small p-value means that the chance of observing the mean we observed under the null hypothesis is small, and thus casts doubt on the null hypothesis. It's good practice to set a desired level of significance or alpha ($\alpha$) before computing the p-value, and then reject the null hypothesis if $p < \alpha$.

For this project, we'll use $\alpha = 0.05$, since it's a common value to use.

Implement the analyze_alpha function to perform a t-test on the sample of portfolio returns. We've imported the scipy.stats module for you to perform the t-test.

Note: scipy.stats.ttest_1samp performs a two-sided test, so divide the p-value by 2 to get 1-sided p-value

In [21]:
from scipy import stats

def analyze_alpha(expected_portfolio_returns_by_date):
    """
    Run a one-sample t-test of the portfolio returns against a mean of zero.

    Parameters
    ----------
    expected_portfolio_returns_by_date : Pandas Series
        Expected portfolio returns for each date

    Returns
    -------
    t_value
        T-statistic from t-test
    p_value
        Half of the two-sided p-value reported by scipy.
        NOTE(review): halving gives the one-sided p-value for "mean > 0" only
        when t_value is positive - confirm callers never rely on it otherwise.
    """
    # Population mean under the null hypothesis.
    null_mean = 0
    t_value, two_sided_p = stats.ttest_1samp(expected_portfolio_returns_by_date, null_mean)
    return t_value, two_sided_p / 2

project_tests.test_analyze_alpha(analyze_alpha)
Tests Passed

年化收益率

In [22]:
# Collapse the per-ticker contributions into one portfolio return per date.
# NOTE(review): Out[19] shows 0.0 (not NaN) for the first two months, where no
# positions exist yet; those zeros stay in the sample after dropna() and feed
# into the mean, standard error and t-test below. Whether sum() yields NaN or 0
# for an all-NaN row (e.g. the final month with no lookahead return) depends on
# the pandas version - confirm which months are meant to be included.
expected_portfolio_returns_by_date = expected_portfolio_returns.T.sum().dropna()
portfolio_ret_mean = expected_portfolio_returns_by_date.mean()
portfolio_ret_ste = expected_portfolio_returns_by_date.sem()
# Convert the mean monthly log return R into an annualized simple return,
# r = e^(12 * R) - 1, expressed in percent.
portfolio_ret_annual_rate = (np.exp(portfolio_ret_mean * 12) - 1) * 100

print("""
Mean:                       {:.6f}
Standard Error:             {:.6f}
Annualized Rate of Return:  {:.2f}%
""".format(portfolio_ret_mean, portfolio_ret_ste, portfolio_ret_annual_rate))
Mean:                       0.003253
Standard Error:             0.002203
Annualized Rate of Return:  3.98%

假设检验的结果

In [23]:
t_value, p_value = analyze_alpha(expected_portfolio_returns_by_date)
print("""
Alpha analysis:
 t-value:        {:.3f}
 p-value:        {:.6f}
""".format(t_value, p_value))
Alpha analysis:
 t-value:        1.476
 p-value:        0.073359

T值1.476对应的单侧P值为0.073359,表示在原假设(策略实际平均收益为零)成立的前提下,观察到不低于当前样本均值的结果的概率约为7.3%。该P值大于我们预设的显著性水平0.05,因此不能拒绝原假设:3.98%的年化收益率有可能只是随机因素造成的,现有数据不足以证明该策略的实际收益率大于零。

如果把显著性水平设置为0.1,则0.073359的P值小于显著性水平,我们可以拒绝原假设,认为3.98%的收益率来自策略本身;但同时犯第一类错误的概率上限也随之提高——当原假设为真时,我们会有0.1的概率错误地拒绝它。

In [ ]: