import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from statsmodels.graphics.tsaplots import plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
First, we load the data from all four network operators from 2010 until 2020.
The data can be accessed at https://energy-charts.info/.
header_list = ['Date', '50Hertz', 'Amprion', 'Tennet', 'Transnet BW']
# Read one CSV per year and concatenate everything into a single frame.
yearly_frames = []
for year in range(2010, 2021):
    filename = f"energy-charts_Electricity_production_in_Germany_in_{year}.csv"
    filepath = os.path.join('data', filename)
    df_current = pd.read_csv(filepath, sep=',', names=header_list, skiprows=1)
    df_current.fillna(0, inplace=True)
    yearly_frames.append(df_current)
df = pd.concat(yearly_frames, ignore_index=True)
df.head()
| | Date | 50Hertz | Amprion | Tennet | Transnet BW |
|---|---|---|---|---|---|
| 0 | 2009-12-31T23:00:00.000Z | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 2010-01-01T00:00:00.000Z | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2010-01-01T01:00:00.000Z | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 2010-01-01T02:00:00.000Z | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 2010-01-01T03:00:00.000Z | 0.0 | 0.0 | 0.0 | 0.0 |
We sum the generation of the four network operators to get the total energy generation. Since the raw data comes at different time resolutions, we average the values within each calendar day to obtain the daily power generation.
df["all"] = df["50Hertz"] + df["Amprion"] + df["Tennet"] + df["Transnet BW"]
df['Date'] = pd.to_datetime(df['Date'])
daily_df = df.groupby([df['Date'].dt.date]).mean()[1:]
daily_df.head()
| Date | 50Hertz | Amprion | Tennet | Transnet BW | all |
|---|---|---|---|---|---|
| 2010-01-01 | 0.004458 | 0.0 | 0.048333 | 0.023208 | 0.076000 |
| 2010-01-02 | 0.004333 | 0.0 | 0.043250 | 0.020833 | 0.068417 |
| 2010-01-03 | 0.002167 | 0.0 | 0.031625 | 0.015167 | 0.048958 |
| 2010-01-04 | 0.001417 | 0.0 | 0.088750 | 0.042667 | 0.132833 |
| 2010-01-05 | 0.002250 | 0.0 | 0.042250 | 0.020333 | 0.064833 |
This is what our data looks like.
plt.figure(figsize=(30,10))
plt.plot(daily_df['all'])
plt.xlabel('Year', fontsize=30)
plt.ylabel('Daily Solar Power Generation in GW', fontsize=30)
plt.title('Daily Solar Power Generation in Germany', fontsize=40)
plt.savefig('data.png')
Let us take a look at the distribution of daily power generation.
sns.displot(daily_df['all'], kde=True)
plt.xlabel('U')
plt.savefig('datadistr.png')
We can see that the distribution is heavily skewed, with most of the mass at low generation values. Since we will later apply a linear regression, we would prefer data that is approximately normally distributed. To reshape the distribution, we apply the following two-step transformation.
First, we apply Min-Max scaling:
$\tilde{U}_t=\frac{U_t-U_{\min}}{U_{\max}-U_{\min}}$
scaler = MinMaxScaler()
scaled = scaler.fit_transform(daily_df['all'].to_numpy().reshape(-1, 1))
# The logit is undefined at exactly 0 and 1, so we nudge the boundary
# values into the open interval (0, 1).
epsilon = 0.1
scaled[scaled == 0] = epsilon
scaled[scaled == 1] = 1 - epsilon
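As a quick sanity check (our addition, not part of the original notebook), we can confirm that all scaled values now lie strictly inside the open unit interval, so the logit below is well defined.
# Sanity check: every scaled value must lie strictly in (0, 1)
# for the logit transformation to be finite.
assert 0 < scaled.min() and scaled.max() < 1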
Then, we apply the logit transformation to the scaled values:
$U^*_t=\log{\left(\frac{\tilde{U}_t}{1-\tilde{U}_t}\right)}$
transformed = np.log(scaled/(1-scaled))
daily_df['transformed'] = transformed
sns.displot(transformed, kde=True, legend=None)
plt.xlabel('$U^*$')
plt.savefig('transformeddistr.png')
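Both steps are invertible, which matters whenever model output should be mapped back to the original GW scale. Here is a minimal sketch of the inverse transformation; this helper is our addition and not part of the original analysis.
def inverse_transform(u_star, scaler):
    # Apply the inverse logit (sigmoid) to return to the (0, 1) interval ...
    u_tilde = 1 / (1 + np.exp(-np.asarray(u_star)))
    # ... then undo the Min-Max scaling with the fitted scaler. This is exact
    # up to the epsilon clipping applied at the boundaries above.
    return scaler.inverse_transform(u_tilde.reshape(-1, 1))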
Let us take a look at our transformed time series.
plt.plot(daily_df['transformed'])
We can see a strong seasonal component in the data. To remove it, we fit a linear regression and continue working with its residuals.
After thinking about the underlying process behind the data and experimenting with it, we model our data as follows:
$U^*_t = \beta_0 + \beta_1 t + \beta_2 \sqrt[4]{t} \cos \left(2\pi \frac{t+11}{365}\right)+X_t$
Here $t$ counts days since the start of the sample, $X_t$ is the remaining stochastic component, and the 11-day shift places the seasonal extremum near the winter solstice.
We will use the scikit-learn library for the implementation. To get more information about the regression, we also fit it with the statsmodels library and print the model summary.
def get_seasonal_component(timeseries):
    """Fit the seasonal regression model and return its fitted values."""
    time = np.arange(len(timeseries))
    x_vals = np.array((time, time**0.25 * np.cos(2 * np.pi * (time + 11) / 365))).T
    lm = LinearRegression().fit(x_vals, timeseries)
    print(f'Intercept: {lm.intercept_}')
    print(f'Coefficients: {lm.coef_}')
    print(f'R-squared: {lm.score(x_vals, timeseries)}')
    return lm.predict(x_vals)
# Build the same design matrix for the statsmodels fit.
time = np.arange(len(transformed))
x_vals = np.array((time, time**0.25 * np.cos(2 * np.pi * (time + 11) / 365))).T
model = sm.OLS(transformed, sm.add_constant(x_vals))
results = model.fit()
results.summary()
| Dep. Variable: | y | R-squared: | 0.724 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.724 |
| Method: | Least Squares | F-statistic: | 5272. |
| Date: | Sun, 07 Aug 2022 | Prob (F-statistic): | 0.00 |
| Time: | 12:57:19 | Log-Likelihood: | -4394.5 |
| No. Observations: | 4018 | AIC: | 8795. |
| Df Residuals: | 4015 | BIC: | 8814. |
| Df Model: | 2 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | -2.0841 | 0.023 | -91.413 | 0.000 | -2.129 | -2.039 |
| x1 | 0.0005 | 9.83e-06 | 46.923 | 0.000 | 0.000 | 0.000 |
| x2 | -0.2282 | 0.002 | -92.181 | 0.000 | -0.233 | -0.223 |

| Omnibus: | 507.081 | Durbin-Watson: | 0.620 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1086.897 |
| Skew: | -0.766 | Prob(JB): | 9.62e-237 |
| Kurtosis: | 5.036 | Cond. No. | 4.64e+03 |
with open('ols.tex', 'w') as f:
    f.write(results.summary(xname=['beta0', 'beta1', 'beta2']).as_latex())
daily_df['seasonality'] = get_seasonal_component(transformed)
daily_df['cleaned'] = daily_df['transformed'] - daily_df['seasonality']
Intercept: [-2.08409933]
Coefficients: [[ 0.00046127 -0.22820751]]
R-squared: 0.7242120048393265
plt.figure(figsize=(15,5))
plt.plot(daily_df['transformed'])
plt.plot(daily_df['seasonality'], c='r', linewidth=3)
plt.legend(['Observed data', 'Regression fit'])
plt.xlabel('Date', fontsize=12)
plt.savefig('seasonality.png')
plt.plot(daily_df['cleaned'])
plt.title('Residual $X_t$', fontsize=15)
plt.xlabel('Date', fontsize=12)
plt.savefig('afterlm.png')
The residuals look good, except at the very beginning. That is not a problem: we simply drop the first year of our data and work with the remaining ten years.
daily_df = daily_df[365:]
daily_df.head()
| Date | 50Hertz | Amprion | Tennet | Transnet BW | all | transformed | seasonality | cleaned |
|---|---|---|---|---|---|---|---|---|
| 2011-01-01 | 0.011042 | 0.022042 | 0.058250 | 0.009917 | 0.101250 | -4.775061 | -2.895385 | -1.879675 |
| 2011-01-02 | 0.024542 | 0.072958 | 0.090208 | 0.035667 | 0.223375 | -3.973572 | -2.892215 | -1.081357 |
| 2011-01-03 | 0.022583 | 0.080875 | 0.066042 | 0.078875 | 0.248375 | -3.865377 | -2.888749 | -0.976628 |
| 2011-01-04 | 0.019125 | 0.089500 | 0.067167 | 0.067958 | 0.243750 | -3.884564 | -2.884988 | -0.999576 |
| 2011-01-05 | 0.051042 | 0.191000 | 0.205750 | 0.106500 | 0.554292 | -3.036477 | -2.880932 | -0.155545 |
We compute the RMSE of our residuals.
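For residuals $X_t$, the RMSE is $\sqrt{\frac{1}{n}\sum_{t=1}^{n}X_t^2}$, which is exactly what dividing the Euclidean norm of the residual vector by $\sqrt{n}$ computes.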
rmse = np.linalg.norm(daily_df['cleaned'])/np.sqrt(len(daily_df))
print(f'RMSE of residuals: {rmse.round(4)}')
RMSE of residuals: 0.6584
sns.displot(daily_df['cleaned'], kde=True)
We can see that our residuals are approximately normally distributed.
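A Q-Q plot gives a second visual check of this claim; the following sketch is our addition, using the qqplot function from the already-imported statsmodels.
# Q-Q plot of the cleaned residuals against a fitted normal distribution;
# points close to the line support approximate normality.
fig = sm.qqplot(daily_df['cleaned'], line='s')
plt.savefig('qqplot.png')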
Now it is time to work on the time series. First, we inspect its partial autocorrelation.
fig = plot_pacf(daily_df['cleaned'])
plt.xlabel('Lags', fontsize=12)
plt.title('Partial Autocorrelation', fontsize=15)
plt.savefig('pacf.png')
Based on this result, we fit an ARIMA(1,0,1) model.
model = ARIMA(daily_df['cleaned'], order=(1,0,1)).fit()
Again, we report the RMSE of the residuals after fitting the ARIMA model.
rmse = np.linalg.norm(model.resid)/np.sqrt(len(model.resid))
print(f'RMSE of residuals after ARIMA: {rmse.round(4)}')
RMSE of residuals after ARIMA: 0.5124
model.summary()
| Dep. Variable: | cleaned | No. Observations: | 3653 |
|---|---|---|---|
| Model: | ARIMA(1, 0, 1) | Log Likelihood | -2738.148 |
| Date: | Sun, 07 Aug 2022 | AIC | 5484.296 |
| Time: | 12:57:23 | BIC | 5509.109 |
| Sample: | 01-01-2011 - 12-31-2020 | HQIC | 5493.132 |
| Covariance Type: | opg | | |

| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 0.0593 | 0.025 | 2.338 | 0.019 | 0.010 | 0.109 |
| ar.L1 | 0.6961 | 0.018 | 38.043 | 0.000 | 0.660 | 0.732 |
| ma.L1 | -0.1208 | 0.024 | -5.031 | 0.000 | -0.168 | -0.074 |
| sigma2 | 0.2621 | 0.006 | 43.612 | 0.000 | 0.250 | 0.274 |

| Ljung-Box (L1) (Q): | 0.14 | Jarque-Bera (JB): | 95.96 |
|---|---|---|---|
| Prob(Q): | 0.71 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 1.20 | Skew: | -0.37 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 3.30 |
with open('arma.tex', 'w') as f:
    f.write(model.summary().as_latex())
plt.figure(figsize=(30,10))
plt.plot(daily_df['cleaned'], c='b')
plt.plot(model.fittedvalues, c='r')
plt.title('Model fit over the whole timespan', fontsize=25)
plt.xlabel('Date', fontsize=18)
plt.legend(['Observed data', 'Model fit'], fontsize=15)
plt.savefig('armafitall.png')
plt.figure(figsize=(30,10))
daily_df['arma_fit'] = model.fittedvalues
plt.plot(daily_df['cleaned'].iloc[365:365*2], c='b')
plt.plot(daily_df['arma_fit'].iloc[365:365*2], c='r')
plt.title('Model fit in 2012', fontsize=25)
plt.xlabel('Date', fontsize=18)
plt.legend(['Observed data', 'Model fit'], fontsize=15)
plt.savefig('armafit1year.png')
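The fitted model can also forecast the residual process beyond the sample. The following is a minimal sketch (our addition), using a hypothetical horizon of 30 days; to obtain a forecast of actual generation, one would extrapolate the seasonal component, add it back, and apply the inverse transformation sketched earlier.
# Forecast X_t 30 days ahead together with a 95% confidence interval.
forecast = model.get_forecast(steps=30)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()
plt.figure(figsize=(15, 5))
plt.plot(forecast_mean, c='g')
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], alpha=0.3)
plt.title('30-day forecast of $X_t$', fontsize=15)
plt.savefig('forecast.png')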
The statsmodels library provides four diagnostic plots that describe the residuals of the fitted model.
fig = plt.figure(figsize=(20,20))
a = model.plot_diagnostics(fig=fig)
plt.savefig('residplot.png')