import pandas as pd
import os
import tempfile
import zipfile
import glob
from tqdm import tqdm
import math
import requests
#colour==0.1.5
#cvxpy==1.0.3
#cycler==0.10.0
#numpy==1.13.3
#pandas==0.21.1
#plotly==2.2.3
#pyparsing==2.2.0
#python-dateutil==2.6.1
#pytz==2017.3
#requests==2.18.4
#scipy==1.0.0
#scikit-learn==0.19.1
#six==1.11.0
#tqdm==4.19.5
# Shared colour palette used by the plotting helpers in this file.
# Values are CSS colour strings, except 'heatmap_colorscale', which is a
# list of (position, colour) stops.
color_scheme = {
'index': '#B6B2CF',
'etf': '#2D3ECF',
'tracking_error': '#6F91DE',
# Table styling used by print_dataframe.
'df_header': 'silver',
'df_value': 'white',
'df_line': 'silver',
'heatmap_colorscale': [(0, '#6F91DE'), (0.5, 'grey'), (1, 'red')],
'background_label': '#9dbdd5',
'low_value': '#B6B2CF',
'high_value': '#2D3ECF',
'y_axis_2_text_color': 'grey',
'shadow': 'rgba(0, 0, 0, 0.75)',
# Line colours used by the trace builders.
'major_line': '#2D3ECF',
'minor_line': '#B6B2CF',
'main_line': 'black'}
def download_quandl_dataset(quandl_api_key, database, dataset, save_path, columns, tickers, start_date, end_date):
    """
    Download a dataset from Quandl and save it to `save_path`.
    Filter by columns, tickers, and date

    The bulk export is downloaded as a zip archive into a temporary directory,
    extracted, filtered and written to `save_path` as csv (without the index).

    :param quandl_api_key: The Quandl API key
    :param database: The Quandl database to download from
    :param dataset: The dataset to download
    :param save_path: The path to save the dataset
    :param columns: The columns to save
    :param tickers: The tickers to save
    :param start_date: First date of the range of rows to keep (passed to `pd.date_range`)
    :param end_date: Last date of the range of rows to keep (passed to `pd.date_range`)
    :raises AssertionError: If the archive does not contain exactly one csv file
    """
    scrape_url = 'https://www.quandl.com/api/v3/datatables/{}/{}?qopts.export=true&api_key={}'\
        .format(database, dataset, quandl_api_key)
    scrape_request = requests.get(scrape_url)
    bulk_download_url = scrape_request.json()['datatable_bulk_download']['file']['link']

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Join with a path separator so the archive lives INSIDE the temporary
        # directory and is removed together with it.
        tmp_wiki_file = os.path.join(tmp_dir, 'tmp.zip')

        bulk_download_request = requests.get(bulk_download_url, stream=True, cookies=scrape_request.cookies)
        total_size = int(bulk_download_request.headers.get('content-length', 0))
        block_size = 1024 * 1024

        with open(tmp_wiki_file, 'wb') as f:
            for data in tqdm(
                    bulk_download_request.iter_content(block_size),
                    # True division: a trailing partial block still counts as one step.
                    total=math.ceil(total_size / block_size),
                    unit='MB',
                    unit_scale=True,
                    desc='Downloading Data'):
                f.write(data)

        with tqdm(total=5, desc='Transforming Data', unit='Action') as pbar:
            # Unzip downloaded data; the context manager closes the archive even on failure.
            with zipfile.ZipFile(tmp_wiki_file, 'r') as zip_ref:
                zip_ref.extractall(tmp_dir)
            pbar.update(1)

            # Check if the zip file only contains one csv file
            # We're assuming that Quandl will always give us the data in a single csv file.
            # If it's different, we want to throw an error.
            csv_files = glob.glob(os.path.join(tmp_dir, '*.csv'))
            if len(csv_files) != 1:
                # Raised explicitly (same exception type as before) so the check
                # is not stripped when Python runs with -O.
                raise AssertionError(
                    'Bulk download of Quandl Wiki data failed. Wrong number of csv files found. Found {} file(s).'
                    .format(len(csv_files)))
            tmp_csv_file = csv_files[0]

            tmp_df = pd.read_csv(tmp_csv_file)
            pbar.update(1)
            tmp_df['date'] = pd.to_datetime(tmp_df['date'])
            pbar.update(1)

            # Remove unused data and save
            tmp_df = tmp_df[tmp_df['date'].isin(pd.date_range(start_date, end_date))]  # Filter unused dates
            tmp_df = tmp_df[tmp_df['ticker'].isin(tickers)]  # Filter unused tickers
            pbar.update(1)
            tmp_df.to_csv(save_path, columns=columns, index=False)  # Filter unused columns and save
            pbar.update(1)
def generate_config():
    """Return a fresh dict of the Plotly config options used for every figure in this file."""
    config = {}
    # Link and mode bar are switched off; axis range entry boxes stay enabled.
    config['showLink'] = False
    config['displayModeBar'] = False
    config['showAxisRangeEntryBoxes'] = True
    return config
import plotly.graph_objs as go
import plotly.offline as offline_py
# Initialise Plotly's offline notebook mode once, before any of the iplot calls below.
offline_py.init_notebook_mode(connected=True)
def _generate_stock_trace(prices):
    """
    Build the scatter trace for a single price series.

    The trace is named 'Index', uses the index of `prices` as x values and
    `prices` itself as y values, and is drawn in the palette's major line colour.
    """
    line_style = {'color': color_scheme['major_line']}
    return go.Scatter(name='Index', x=prices.index, y=prices, line=line_style)
def _generate_traces(name_df_color_data):
    """
    Build one line trace per (name, data, color) entry.

    Parameters
    ----------
    name_df_color_data : iterable of (str, pandas object, str)
        For each trace: legend name, the data to plot (its index supplies the
        x values, the object itself the y values) and the line colour.

    Returns
    -------
    traces : list of go.Scatter
        Traces in the same order as the input entries.
    """
    return [
        go.Scatter(
            name=name,
            x=df.index,
            y=df,
            # 'lines' is the valid scatter mode flag; 'line' is not accepted.
            mode='lines',
            line={'color': color})
        for name, df, color in name_df_color_data]
def print_dataframe(df, n_rows=10, n_columns=3):
    """
    Display the top-left corner of a DataFrame as a Plotly table.

    Parameters
    ----------
    df : DataFrame
        Data to display; the shown cells are formatted with '{:.3f}'.
    n_rows : int
        Maximum number of rows to show
    n_columns : int
        Maximum number of columns to show

    When `df` has more rows or columns than shown, an extra '...' row and/or
    column is appended to signal the truncation.
    """
    missing_val_str = '...'
    config = generate_config()

    formatted_df = df.iloc[:n_rows, :n_columns]
    formatted_df = formatted_df.applymap('{:.3f}'.format)

    if len(df.columns) > n_columns:
        formatted_df[missing_val_str] = [missing_val_str]*len(formatted_df.index)
    if len(df.index) > n_rows:
        formatted_df.loc[missing_val_str] = [missing_val_str]*len(formatted_df.columns)

    trace = go.Table(
        type='table',
        columnwidth=[1, 3],
        header={
            # Leading empty header cell sits above the index column.
            'values': [''] + list(formatted_df.columns.values),
            # Use the module-level palette directly; no `helper` module is imported in this file.
            'line': {'color': color_scheme['df_line']},
            'fill': {'color': color_scheme['df_header']},
            'font': {'size': 13}},
        cells={
            # reset_index() turns the index into the first displayed column.
            'values': formatted_df.reset_index().values.T,
            'line': {'color': color_scheme['df_line']},
            'fill': {'color': [color_scheme['df_header'], color_scheme['df_value']]},
            'font': {'size': 13}})

    offline_py.iplot([trace], config=config)
def plot_stock(prices, title):
    """Plot a single price series under the given title in the notebook."""
    figure = {
        'data': [_generate_stock_trace(prices)],
        'layout': go.Layout(title=title),
    }
    offline_py.iplot(figure, config=generate_config())
def plot_resampled_prices(df_resampled, df, title):
    """
    Plot resampled prices ('Monthly Close') together with the original
    prices ('Close') in one figure with the given title.
    """
    series_specs = [
        ('Monthly Close', df_resampled, color_scheme['major_line']),
        ('Close', df, color_scheme['minor_line']),
    ]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title),
    }
    offline_py.iplot(figure, config=generate_config())
def plot_returns(returns, title):
    """Plot a single returns series, labelled 'Returns', under the given title."""
    series_specs = [('Returns', returns, color_scheme['major_line'])]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title),
    }
    offline_py.iplot(figure, config=generate_config())
def plot_shifted_returns(df_shited, df, title):
    """
    Plot shifted returns ('Shifted Returns') together with the unshifted
    returns ('Returns') in one figure with the given title.
    """
    series_specs = [
        ('Shifted Returns', df_shited, color_scheme['major_line']),
        ('Returns', df, color_scheme['minor_line']),
    ]
    figure = {
        'data': _generate_traces(series_specs),
        'layout': go.Layout(title=title),
    }
    offline_py.iplot(figure, config=generate_config())
def print_top(df, name, top_n=10):
    """
    Print the `top_n` column labels of `df` with the largest column sums.

    A header line '<top_n> Most <name>:' is printed first, followed by the
    labels joined with ', ' in descending order of their sum.
    """
    print('{} Most {}:'.format(top_n, name))
    column_totals = df.sum()
    ranked = column_totals.sort_values(ascending=False)
    leaders = ranked.index[:top_n].values.tolist()
    print(', '.join(leaders))
import pandas as pd
import numpy as np
import project_tests
# Load the quote data; the 'date' column is parsed into datetimes.
df = pd.read_csv('eod-quotemedia.csv', parse_dates=['date'], index_col=False)
# Bare expression: shown as cell output when run in a notebook.
df.head(10)
# Reshape to one row per date and one column per ticker, holding the adjusted close.
close = df.reset_index().pivot(index='date', columns='ticker', values='adj_close')
close.head(10)
# Ticker used for the example plots throughout the rest of the file.
apple_ticker = 'AAPL'
plot_stock(close[apple_ticker], '{} Stock'.format(apple_ticker))
本项目使用月收盘价格每月交易一次,所以需要按照月重新抽样每月的收盘价
def resample_prices(close_prices, freq='M'):
    """
    Resample close prices for each ticker at the specified frequency,
    keeping the last price of every period.

    Parameters
    ----------
    close_prices : DataFrame
        Close prices for each ticker and date
    freq : str
        What frequency to sample at
        For valid freq choices, see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases

    Returns
    -------
    prices_resampled : DataFrame
        Resampled prices for each ticker and date
    """
    periods = close_prices.resample(freq)
    prices_resampled = periods.last()
    return prices_resampled
# Run the project-provided check against resample_prices.
project_tests.test_resample_prices(resample_prices)
# Resample the close prices with the default frequency ('M').
monthly_close = resample_prices(close)
# Compare the resampled series with the original one for the example ticker.
plot_resampled_prices(
monthly_close.loc[:, apple_ticker],
close.loc[:, apple_ticker],
'{} Stock - Close Vs Monthly Close'.format(apple_ticker))
log return $= R = log_e(\frac{P_t}{P_{t-1}}) = log_e(P_t) - log_e(P_{t-1})$
raw return $= r = \frac{{P_t}-P_{t-1}}{P_{t-1}}$
$R = log_e(r+1)$
$r = e^R -1$
根据连续复利的推导公式可知,股票的对数收益 R 实际上就是连续复利回报利率
def compute_log_returns(prices):
    """
    Compute log returns for each ticker: log(P_t) - log(P_{t-1}).

    The first row has no previous price and is therefore NaN.

    Parameters
    ----------
    prices : DataFrame
        Prices for each ticker and date

    Returns
    -------
    log_returns : DataFrame
        Log returns for each ticker and date
    """
    log_prices = np.log(prices)
    previous_log_prices = log_prices.shift(1)
    return log_prices - previous_log_prices
# Run the project-provided check against compute_log_returns.
project_tests.test_compute_log_returns(compute_log_returns)
# Log returns of the resampled (monthly) close prices.
monthly_close_returns = compute_log_returns(monthly_close)
plot_returns(
monthly_close_returns.loc[:, apple_ticker],
'Log Returns of {} Stock (Monthly)'.format(apple_ticker))
实现函数shift_returns
在时间序列上移动收益到以前或者未来的时间,例如当shift_n
为2时,收益数据像下面这样子:
Returns
A B C D
2013-07-08 0.015 0.082 0.096 0.020 ...
2013-07-09 0.037 0.095 0.027 0.063 ...
2013-07-10 0.094 0.001 0.093 0.019 ...
2013-07-11 0.092 0.057 0.069 0.087 ...
... ... ... ... ...
the output of the shift_returns
function would be:
Shift Returns
A B C D
2013-07-08 NaN NaN NaN NaN ...
2013-07-09 NaN NaN NaN NaN ...
2013-07-10 0.015 0.082 0.096 0.020 ...
2013-07-11 0.037 0.095 0.027 0.063 ...
... ... ... ... ...
Using the same returns
data as above, the shift_returns
function should generate the following with shift_n
as -2:
Shift Returns
A B C D
2013-07-08 0.094 0.001 0.093 0.019 ...
2013-07-09 0.092 0.057 0.069 0.087 ...
... ... ... ... ... ...
... ... ... ... ... ...
... NaN NaN NaN NaN ...
... NaN NaN NaN NaN ...
Note: The "..." represents data points we're not showing.
def shift_returns(returns, shift_n):
    """
    Generate shifted returns.

    The index is left unchanged; values move `shift_n` rows down (positive)
    or up (negative), and the vacated rows become NaN.

    Parameters
    ----------
    returns : DataFrame
        Returns for each ticker and date
    shift_n : int
        Number of periods to move, can be positive or negative

    Returns
    -------
    shifted_returns : DataFrame
        Shifted returns for each ticker and date
    """
    shifted_returns = returns.shift(periods=shift_n)
    return shifted_returns
# Run the project-provided check against shift_returns.
project_tests.test_shift_returns(shift_returns)
# Returns of the previous month
prev_returns = shift_returns(monthly_close_returns, 1)
# Returns of the next month
lookahead_returns = shift_returns(monthly_close_returns, -1)
plot_shifted_returns(
prev_returns.loc[:, apple_ticker],
monthly_close_returns.loc[:, apple_ticker],
'Previous Returns of {} Stock'.format(apple_ticker))
plot_shifted_returns(
lookahead_returns.loc[:, apple_ticker],
monthly_close_returns.loc[:, apple_ticker],
'Lookahead Returns of {} Stock'.format(apple_ticker))
交易信号是一序列的交易动作或者可以交易的结果。一个共同的形式是每天产生长短头寸结合的投资组合(例如每月月末,或者其他你想要的交易频率)。在交易日可以根据交易信息重新平衡投资组合,买入多头寸和卖出空头寸。
本实验尝试下面的策略:
在观察期的每月末,前面月份的股票收益从高到低排序,选择表现最好的买入,卖出表现最差的
Implement the get_top_n
function to get the top performing stock for each month. Get the top performing stocks from prev_returns
by assigning them a value of 1. For all other stocks, give them a value of 0. For example, using the following prev_returns
:
Previous Returns
A B C D E F G
2013-07-08 0.015 0.082 0.096 0.020 0.075 0.043 0.074
2013-07-09 0.037 0.095 0.027 0.063 0.024 0.086 0.025
... ... ... ... ... ... ... ...
The function get_top_n
with top_n
set to 3 should return the following:
Previous Returns
A B C D E F G
2013-07-08 0 1 1 0 1 0 0
2013-07-09 0 1 0 1 0 1 0
... ... ... ... ... ... ... ...
Note: You may have to use pandas' DataFrame.iterrows
with Series.nlargest
in order to implement the function. This is one of those cases where creating a vectorized solution is too difficult.
def get_top_n(prev_returns, top_n):
"""
Select the top performing stocks
Parameters
----------
prev_returns : DataFrame
Previous shifted returns for each ticker and date
top_n : int
The number of top performing stocks to get
Returns
-------
top_stocks : DataFrame
Top stocks for each ticker and date marked with a 1
"""
# TODO: Implement Function
top_stocks = prev_returns.copy()
for i,row in prev_returns.iterrows():
top_stocks.loc[i].loc[row.nlargest(top_n).index] = 1.0
top_stocks.loc[i].loc[set(row.index) - set(row.nlargest(top_n).index)] = 0
return top_stocks.astype(np.int)
# Run the project-provided check against get_top_n.
project_tests.test_get_top_n(get_top_n)
# Number of stocks to pick on each side (long and short).
top_bottom_n = 50
df_long = get_top_n(prev_returns, top_bottom_n)
# Negating the returns turns the worst performers into the "top" ones.
df_short = get_top_n(-1*prev_returns, top_bottom_n)
print_top(df_long, 'Longed Stocks',top_n=10)
print_top(df_short, 'Shorted Stocks',top_n=10)
现在是时候检查交易信号是否有获利的潜力
首先计算组合的净利润。为了简单性,我们假设每只股票投资等量资金,这样每只股票收益的简单算术平均就是投资组合的收益
Implement the portfolio_returns
function to compute the expected portfolio returns. Using df_long
to indicate which stocks to long and df_short
to indicate which stocks to short, calculate the returns using lookahead_returns
. To help with calculation, we've provided you with n_stocks
as the number of stocks we're investing in a single period.
def portfolio_returns(df_long, df_short, lookahead_returns, n_stocks):
    """
    Compute expected returns for the portfolio, assuming equal investment in each long/short stock.

    Parameters
    ----------
    df_long : DataFrame
        Top stocks for each ticker and date marked with a 1
    df_short : DataFrame
        Bottom stocks for each ticker and date marked with a 1
    lookahead_returns : DataFrame
        Lookahead returns for each ticker and date
    n_stocks: int
        The number of stocks chosen for each month

    Returns
    -------
    portfolio_returns : DataFrame
        Expected portfolio returns for each ticker and date
    """
    # Net position per stock: +1 long, -1 short, 0 when both signals fire
    # (they offset) or neither does. A short position therefore earns a
    # positive contribution when the stock's return is negative.
    net_positions = df_long - df_short
    position_returns = lookahead_returns * net_positions
    # Dividing by the number of stocks here means the caller only has to sum
    # across tickers to obtain the equal-weighted portfolio return.
    return position_returns / n_stocks
# Run the project-provided check against portfolio_returns.
project_tests.test_portfolio_returns(portfolio_returns)
# n_stocks is twice top_bottom_n: that many longs plus that many shorts.
expected_portfolio_returns = portfolio_returns(df_long, df_short, lookahead_returns, 2*top_bottom_n)
# Bare expressions: shown as cell output when run in a notebook.
expected_portfolio_returns.head(10)
# Summing across tickers gives one portfolio return per date.
expected_portfolio_returns.T.sum().head(10)
plot_returns(expected_portfolio_returns.T.sum(), 'Portfolio Returns')
T检验,亦称student检验,主要用于样本含量较小(例如小于30)、总体标准差未知的正态分布总体。它是检验小样本的两个平均值差异程度的方法:用T分布理论来推断差异发生的概率,从而判断两个平均数的差异是否显著。
单个样本的T检验:目的是比较样本均值所代表的未知总体均值$\mu$和已知总体的均值$\mu_0$
配对样本T检验:将受试对象的某些重要特征按相近的原则配成对子,目的是消除混杂因素的影响,一对观察对象之间除了处理因素/研究因素之外,其它因素基本齐同,每对中的两个个体随机给予两种处理
具体步骤请参考智库百科
Our null hypothesis ($H_0$) is that the actual mean return from the signal is zero. We'll perform a one-sample, one-sided t-test on the observed mean return, to see if we can reject $H_0$.
We'll need to first compute the t-statistic, and then find its corresponding p-value. The p-value will indicate the probability of observing a mean return equally or more extreme than the one we observed if the null hypothesis were true. A small p-value means that the chance of observing the mean we observed under the null hypothesis is small, and thus casts doubt on the null hypothesis. It's good practice to set a desired level of significance or alpha ($\alpha$) before computing the p-value, and then reject the null hypothesis if $p < \alpha$.
For this project, we'll use $\alpha = 0.05$, since it's a common value to use.
Implement the analyze_alpha
function to perform a t-test on the sample of portfolio returns. We've imported the scipy.stats
module for you to perform the t-test.
Note: scipy.stats.ttest_1samp
performs a two-sided test, so divide the p-value by 2 to get 1-sided p-value
from scipy import stats
def analyze_alpha(expected_portfolio_returns_by_date):
    """
    Perform a t-test with the null hypothesis being that the expected mean return is zero.

    Parameters
    ----------
    expected_portfolio_returns_by_date : Pandas Series
        Expected portfolio returns for each date

    Returns
    -------
    t_value
        T-statistic from t-test
    p_value
        Corresponding p-value (the two-sided p-value halved)
    """
    # The second argument is the population mean assumed under the null hypothesis.
    null_mean = 0
    t_value, two_sided_p = stats.ttest_1samp(expected_portfolio_returns_by_date, null_mean)
    # ttest_1samp reports a two-sided p-value; halve it for the one-sided test.
    return t_value, two_sided_p / 2
# Run the project-provided check against analyze_alpha.
project_tests.test_analyze_alpha(analyze_alpha)
# Sum across tickers to get one portfolio return per date.
expected_portfolio_returns_by_date = expected_portfolio_returns.T.sum().dropna()
portfolio_ret_mean = expected_portfolio_returns_by_date.mean()
# Standard error of the mean.
portfolio_ret_ste = expected_portfolio_returns_by_date.sem()
# Convert the mean log return into a raw return via r = e^R - 1, scaled by 12
# periods (presumably months per year) and expressed in percent.
portfolio_ret_annual_rate = (np.exp(portfolio_ret_mean * 12) - 1) * 100
print("""
Mean: {:.6f}
Standard Error: {:.6f}
Annualized Rate of Return: {:.2f}%
""".format(portfolio_ret_mean, portfolio_ret_ste, portfolio_ret_annual_rate))
t_value, p_value = analyze_alpha(expected_portfolio_returns_by_date)
print("""
Alpha analysis:
t-value: {:.3f}
p-value: {:.6f}
""".format(t_value, p_value))
T值1.476对应的P值为0.073359,表示在原假设成立的前提下,观察到同样或更极端的平均收益的概率为0.073359。该值大于我们预设的显著性水平0.05,说明3.98%的收益率很可能是随机因素造成的。因此我们不能拒绝原假设,也就是没有充分证据表明该策略的实际收益率不为零。
如果设置显著性水平为0.1,则0.073359的P值小于该显著性水平,我们可以拒绝原假设,认为3.98%的收益率是策略有效的结果;但同时我们犯第一类错误的概率会增加,此时在原假设为真的情况下有0.1的概率错误地拒绝原假设。