# http://www.scipy.org/Cookbook/OLS?action=AttachFile&do=get&target=ols.0.2.py
from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time

class ols:
    """
    Author: Vincent Nijs (+ ?)

    Email: v-nijs at kellogg.northwestern.edu

    Last Modified: Mon Jan 15 17:56:17 CST 2007

    Dependencies: See import statement at the top of this file

    Doc: Class for multi-variate regression using OLS

    For usage examples of other class methods see the class tests at the bottom of this file.
    To see the class in action simply run this file using 'python ols.py'. This will generate
    some simulated data and run various analyses. If you have rpy installed the same model
    will also be estimated by R for confirmation.

    Input:
        y = dependent variable
        y_varnm = string with the variable label for y
        x = independent variables, note that a constant is added by default
        x_varnm = string or list of variable labels for the independent variables

    Output:
        There are no values returned by the class. Summary provides printed output.
        All other measures can be accessed as follows:

        Step 1: Create an OLS instance by passing data to the class

            m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])

        Step 2: Get specific metrics

            To print the coefficients:
                >>> print m.b
            To print the coefficients p-values:
                >>> print m.p
    """

    def __init__(self,y,x,y_varnm = 'y',x_varnm = ''):
        """
        Initializing the ols class.
        """
        self.y = y
        self.x = c_[ones(x.shape[0]),x]
        self.y_varnm = y_varnm
        if not isinstance(x_varnm,list):
            self.x_varnm = ['const'] + list(x_varnm)
        else:
            self.x_varnm = ['const'] + x_varnm

        # Estimate model using OLS
        self.estimate()

    def estimate(self):

        # estimating coefficients, and basic stats
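        # closed-form OLS solution: b = (X'X)^-1 X'y; the inverse of X'X is
        # stored because it is reused below for the coefficient standard errors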
        self.inv_xx = inv(dot(self.x.T,self.x))
        xy = dot(self.x.T,self.y)
        self.b = dot(self.inv_xx,xy)                    # estimate coefficients

        self.nobs = self.y.shape[0]                     # number of observations
        self.ncoef = self.x.shape[1]                    # number of coef.
        self.df_e = self.nobs - self.ncoef              # degrees of freedom, error
        self.df_r = self.ncoef - 1                      # degrees of freedom, regression

        self.e = self.y - dot(self.x,self.b)            # residuals
        self.sse = dot(self.e,self.e)/self.df_e         # residual mean square (sum of squared residuals / df_e)
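        # coefficient covariance matrix is self.sse * (X'X)^-1; the standard
        # errors are the square roots of its diagonal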
        self.se = sqrt(diagonal(self.sse*self.inv_xx))  # coef. standard errors
        self.t = self.b / self.se                       # coef. t-statistics
        self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2    # coef. p-values

        self.R2 = 1 - self.e.var()/self.y.var()         # model R-squared
        self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef))   # adjusted R-square

        self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)  # model F-statistic
        self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)  # F-statistic p-value
    def dw(self):
        """
        Calculates the Durbin-Watson statistic; a value near 2 suggests little
        first-order autocorrelation in the residuals.
        """
        de = diff(self.e,1)
        dw = dot(de,de) / dot(self.e,self.e)
        return dw
    def omni(self):
        """
        Omnibus test for normality
        """
        return stats.normaltest(self.e)
    def JB(self):
        """
        Calculate residual skewness, kurtosis, and do the JB test for normality
        """

        # Calculate residual skewness and kurtosis
        skew = stats.skew(self.e)
        kurtosis = 3 + stats.kurtosis(self.e)

        # Calculate the Jarque-Bera test for normality
        JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
        JBpv = 1-stats.chi2.cdf(JB,2)

        return JB, JBpv, skew, kurtosis
    def ll(self):
        """
        Calculate model log-likelihood and two information criteria
        """

        # Model log-likelihood, AIC, and BIC criterion values
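        # note: both criteria below are divided by nobs, i.e. they are reported
        # in per-observation form rather than as the usual totals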
        ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
        aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
        bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs

        return ll, aic, bic
    def summary(self):
        """
        Printing model output to screen
        """

        # local time & date
        t = time.localtime()

        # extra stats
        ll, aic, bic = self.ll()
        JB, JBpv, skew, kurtosis = self.JB()
        omni, omnipv = self.omni()

        # printing output to screen
        print '\n=============================================================================='
        print "Dependent Variable: " + self.y_varnm
        print "Method: Least Squares"
        print "Date: ", time.strftime("%a, %d %b %Y",t)
        print "Time: ", time.strftime("%H:%M:%S",t)
        print '# obs:               %5.0f' % self.nobs
        print '# variables:         %5.0f' % self.ncoef
        print '=============================================================================='
        print 'variable     coefficient     std. Error      t-statistic     prob.'
        print '=============================================================================='
        for i in range(len(self.x_varnm)):
            print '''% -5s          % -5.6f     % -5.6f     % -5.6f     % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]])
        print '=============================================================================='
        print 'Models stats                         Residual stats'
        print '=============================================================================='
        print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' % tuple([self.R2, self.dw()])
        print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
        print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
        print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
        print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
        print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
        print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
        print '=============================================================================='


if __name__ == '__main__':

    ##########################
    ### testing the ols class
    ##########################

    # creating simulated data and variable labels
    seed(1)
    data = randn(100,5)         # the data array

    # intercept is added, by default
    m = ols(data[:,0],data[:,1:],y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])
    m.summary()

    # if you have rpy installed, use it to test the results
    have_rpy = False
    try:
        print "\n"
        print "="*30
        print "Validating OLS results in R"
        print "="*30

        import rpy
        have_rpy = True
    except ImportError:
        print "\n"
        print "="*30
        print "Validating OLS-class results in R"
        print "="*30
        print "rpy is not installed"
        print "="*30

    if have_rpy:
        y = data[:,0]
        x1 = data[:,1]
        x2 = data[:,2]
        x3 = data[:,3]
        x4 = data[:,4]
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y))
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print linear_model.as_py()['coefficients']
        summary = rpy.r.summary(linear_model)
        print summary
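
    # ------------------------------------------------------------------
    # Additional cross-check (not part of the original cookbook script):
    # a minimal sketch recovering the same coefficients directly with
    # numpy.linalg.lstsq on the simulated data generated above.
    # ------------------------------------------------------------------
    import numpy as np
    X = np.column_stack((np.ones(data.shape[0]), data[:,1:]))
    b_check = np.linalg.lstsq(X, data[:,0])[0]
    print "\nCoefficients from numpy.linalg.lstsq (should match m.b):"
    print b_check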