Logo

Autoregressive Moving Average (ARMA): Sunspots dataΒΆ

Link to Notebook GitHub

In [1]:
from __future__ import print_function
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
In [2]:
from statsmodels.graphics.api import qqplot

Sunpots Data

In [3]:
print(sm.datasets.sunspots.NOTE)
::

    Number of Observations - 309 (Annual 1700 - 2008)
    Number of Variables - 1
    Variable name definitions::

        SUNACTIVITY - Number of sunspots for each year

    The data file contains a 'YEAR' variable that is not returned by load.


In [4]:
dta = sm.datasets.sunspots.load_pandas().data
In [5]:
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del dta["YEAR"]
In [6]:
dta.plot(figsize=(12,8));
In [7]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)
In [8]:
arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit()
print(arma_mod20.params)
const                49.659498
ar.L1.SUNACTIVITY     1.390657
ar.L2.SUNACTIVITY    -0.688572
dtype: float64

In [9]:
arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit()
In [10]:
print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
2622.63633807 2637.56970317 2628.60672591

In [11]:
print(arma_mod30.params)
const                49.749980
ar.L1.SUNACTIVITY     1.300810
ar.L2.SUNACTIVITY    -0.508093
ar.L3.SUNACTIVITY    -0.129649
dtype: float64

In [12]:
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)
2619.4036287 2638.07033508 2626.8666135

  • Does our model obey the theory?
In [13]:
sm.stats.durbin_watson(arma_mod30.resid.values)
Out[13]:
1.9564810325209219
In [14]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax);
In [15]:
resid = arma_mod30.resid
In [16]:
stats.normaltest(resid)
Out[16]:
(49.845005822356519, 1.5007021697683078e-11)
In [17]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)
In [18]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)
In [19]:
r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
           AC          Q      Prob(>Q)
lag
1    0.009179   0.026286  8.712045e-01
2    0.041793   0.573045  7.508703e-01
3   -0.001335   0.573604  9.024476e-01
4    0.136089   6.408921  1.706203e-01
5    0.092468   9.111823  1.046862e-01
6    0.091948  11.793235  6.674367e-02
7    0.068748  13.297191  6.519005e-02
8   -0.015020  13.369219  9.976163e-02
9    0.187592  24.641900  3.393922e-03
10   0.213718  39.321983  2.229485e-05
11   0.201082  52.361126  2.344961e-07
12   0.117182  56.804180  8.574289e-08
13  -0.014055  56.868316  1.893909e-07
14   0.015398  56.945555  3.997672e-07
15  -0.024967  57.149311  7.741494e-07
16   0.080916  59.296758  6.872193e-07
17   0.041138  59.853726  1.110949e-06
18  -0.052021  60.747417  1.548438e-06
19   0.062496  62.041681  1.831650e-06
20  -0.010301  62.076969  3.381255e-06
21   0.074453  63.926644  3.193596e-06
22   0.124955  69.154761  8.978385e-07
23   0.093162  72.071026  5.799799e-07
24  -0.082152  74.346679  4.713030e-07
25   0.015695  74.430034  8.289065e-07
26  -0.025037  74.642893  1.367288e-06
27  -0.125861  80.041143  3.722572e-07
28   0.053225  81.009976  4.716286e-07
29  -0.038693  81.523803  6.916641e-07
30  -0.016904  81.622221  1.151662e-06
31  -0.019296  81.750933  1.868768e-06
32   0.104990  85.575061  8.927965e-07
33   0.040086  86.134563  1.247510e-06
34   0.008829  86.161806  2.047827e-06
35   0.014588  86.236444  3.263810e-06
36  -0.119329  91.248895  1.084455e-06
37  -0.036665  91.723863  1.521924e-06
38  -0.046193  92.480513  1.938735e-06
39  -0.017768  92.592882  2.990679e-06
40  -0.006220  92.606705  4.696984e-06

[40 rows x 3 columns]

  • This indicates a lack of fit.
  • In-sample dynamic prediction. How good does our model do?
In [20]:
predict_sunspots = arma_mod30.predict('1990', '2012', dynamic=True)
print(predict_sunspots)
1990-12-31    167.047423
1991-12-31    140.993017
1992-12-31     94.859145
1993-12-31     46.860952
1994-12-31     11.242657
1995-12-31     -4.721209
1996-12-31     -1.166825
1997-12-31     16.185768
1998-12-31     39.021942
1999-12-31     59.449910
2000-12-31     72.170164
2001-12-31     75.376796
2002-12-31     70.436470
2003-12-31     60.731607
2004-12-31     50.201830
2005-12-31     42.076074
2006-12-31     38.114345
2007-12-31     38.454705
2008-12-31     41.963875
2009-12-31     46.869339
2010-12-31     51.423303
2011-12-31     54.399752
2012-12-31     55.321720
Freq: A-DEC, dtype: float64

In [21]:
fig, ax = plt.subplots(figsize=(12, 8))
ax = dta.ix['1950':].plot(ax=ax)
fig = arma_mod30.plot_predict('1990', '2012', dynamic=True, ax=ax, plot_insample=False)
In [22]:
def mean_forecast_err(y, yhat):
    return y.sub(yhat).mean()
In [23]:
mean_forecast_err(dta.SUNACTIVITY, predict_sunspots)
Out[23]:
5.636913445759844

Exercise: Can you obtain a better fit for the Sunspots model? (Hint: sm.tsa.AR has a method select_order)

Simulated ARMA(4,1): Model Identification is Difficult

In [24]:
from statsmodels.tsa.arima_process import arma_generate_sample, ArmaProcess
In [25]:
np.random.seed(1234)
# include zero-th lag
arparams = np.array([1, .75, -.65, -.55, .9])
maparams = np.array([1, .65])

Let's make sure this model is estimable.

In [26]:
arma_t = ArmaProcess(arparams, maparams)
In [27]:
arma_t.isinvertible()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-443-d3a1a0e5898b> in <module>()
----> 1 arma_t.isinvertible()

TypeError: 'bool' object is not callable
In [28]:
arma_t.isstationary()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-444-55a9b2cc43b1> in <module>()
----> 1 arma_t.isstationary()

TypeError: 'bool' object is not callable
  • What does this mean?
In [29]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(arma_t.generate_sample(size=50));
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-445-d059f8971c1a> in <module>()
      1 fig = plt.figure(figsize=(12,8))
      2 ax = fig.add_subplot(111)
----> 3 ax.plot(arma_t.generate_sample(size=50));

TypeError: generate_sample() got an unexpected keyword argument 'size'
In [30]:
arparams = np.array([1, .35, -.15, .55, .1])
maparams = np.array([1, .65])
arma_t = ArmaProcess(arparams, maparams)
arma_t.isstationary()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-446-317f1b2ac56b> in <module>()
      2 maparams = np.array([1, .65])
      3 arma_t = ArmaProcess(arparams, maparams)
----> 4 arma_t.isstationary()

TypeError: 'bool' object is not callable
In [31]:
arma_rvs = arma_t.generate_sample(size=500, burnin=250, scale=2.5)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-447-e0a3cc13cb6e> in <module>()
----> 1 arma_rvs = arma_t.generate_sample(size=500, burnin=250, scale=2.5)

TypeError: generate_sample() got an unexpected keyword argument 'size'
In [32]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_rvs, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_rvs, lags=40, ax=ax2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-448-8e761b44cfae> in <module>()
      1 fig = plt.figure(figsize=(12,8))
      2 ax1 = fig.add_subplot(211)
----> 3 fig = sm.graphics.tsa.plot_acf(arma_rvs, lags=40, ax=ax1)
      4 ax2 = fig.add_subplot(212)
      5 fig = sm.graphics.tsa.plot_pacf(arma_rvs, lags=40, ax=ax2)

NameError: name 'arma_rvs' is not defined
  • For mixed ARMA processes the Autocorrelation function is a mixture of exponentials and damped sine waves after (q-p) lags.
  • The partial autocorrelation function is a mixture of exponentials and dampened sine waves after (p-q) lags.
In [33]:
arma11 = sm.tsa.ARMA(arma_rvs, (1,1)).fit()
resid = arma11.resid
r,q,p = sm.tsa.acf(resid, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-449-03653831c71c> in <module>()
----> 1 arma11 = sm.tsa.ARMA(arma_rvs, (1,1)).fit()
      2 resid = arma11.resid
      3 r,q,p = sm.tsa.acf(resid, qstat=True)
      4 data = np.c_[range(1,41), r[1:], q, p]
      5 table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])

NameError: name 'arma_rvs' is not defined
In [34]:
arma41 = sm.tsa.ARMA(arma_rvs, (4,1)).fit()
resid = arma41.resid
r,q,p = sm.tsa.acf(resid, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-450-30d9c2f35894> in <module>()
----> 1 arma41 = sm.tsa.ARMA(arma_rvs, (4,1)).fit()
      2 resid = arma41.resid
      3 r,q,p = sm.tsa.acf(resid, qstat=True)
      4 data = np.c_[range(1,41), r[1:], q, p]
      5 table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])

NameError: name 'arma_rvs' is not defined

Exercise: How good of in-sample prediction can you do for another series, say, CPI

In [35]:
macrodta = sm.datasets.macrodata.load_pandas().data
macrodta.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
cpi = macrodta["cpi"]

Hint:

In [36]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = cpi.plot(ax=ax);
ax.legend();

P-value of the unit-root test, resoundly rejects the null of no unit-root.

In [37]:
print(sm.tsa.adfuller(cpi)[1])
0.990432818834

This Page