In [ ]:
!pip install numpy==1.17.3
!pip install statsmodels
!pip install pmdarima
!pip install awswrangler
!pip install ipython-sql
In [43]:
import awswrangler as wr
import json

# connect to Redshift cluster
data = {}
with open('connection.json') as f:
  data = json.load(f)

engine = wr.db.get_engine(
        db_type="redshift",
        host=data['host_name'],
        port=data['port_num'],
        database=data['db_name'],
        user=data['username'],
        password=data['password']
)

results = wr.db.read_sql_query('select current_user, version();', con=engine)
results
Out[43]:
current_user version
0 awsuser PostgreSQL 8.0.2 on i686-pc-linux-gnu, compile...
In [2]:
query_create_schema = '''
CREATE EXTERNAL SCHEMA IF NOT EXISTS ds_lab
FROM DATA CATALOG DATABASE 'default' 
IAM_ROLE 'arn:aws:iam::###account_id###:role/###redshift_role###' 
CREATE EXTERNAL DATABASE IF NOT EXISTS;
'''

with engine.connect() as con:
    con.execute(query_create_schema)

Then create an external table via Redshift QueryEditor using sample sales data.

CREATE external TABLE ds_lab.sales_raw(
  invoiceno varchar(16), 
  stockcode varchar(16), 
  description varchar(128), 
  quantity varchar(16), 
  invoicedate varchar(32), 
  unitprice varchar(16), 
  customerid varchar(16), 
  country varchar(32)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'serialization.format' = ',',
  'field.delim' = ','
)
LOCATION
  's3://redshift-demos/data/sales_forecasting/raw_csv/'
TABLE PROPERTIES (
  'skip.header.line.count'='1');
In [10]:
results = wr.db.read_sql_query('select * from ds_lab.sales_raw limit 5;', con=engine)
results
Out[10]:
invoiceno stockcode description quantity invoicedate unitprice customerid country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom
In [14]:
query_create_table = '''
create table public.sales_clean as
    select invoiceno, stockcode, TO_DATE(invoicedate, 'MM/DD/YYYY HH24:MI') as invoicedate,
        cast(quantity as bigint) as quantity, 
        cast(unitprice as float) as unitprice, 
        country from ds_lab.sales_raw 
    where 
    trim(unitprice) not like '0'
    and stockcode not in ('B', 'D', 'M', 'm', 'S');
'''
with engine.connect().execution_options(autocommit=True) as con:
    con.execute(query_create_table)
In [15]:
results = wr.db.read_sql_query('select * from public.sales_clean limit 5;', con=engine)
results
Out[15]:
invoiceno stockcode invoicedate quantity unitprice country
0 536365 71053 2010-12-01 6 3.39 United Kingdom
1 536365 84029G 2010-12-01 6 3.39 United Kingdom
2 536365 22752 2010-12-01 2 7.65 United Kingdom
3 536366 22633 2010-12-01 6 1.85 United Kingdom
4 536367 84879 2010-12-01 32 1.69 United Kingdom
In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

daily_sales = '''select date_trunc('day', invoicedate) as invoicedate, sum(quantity * unitprice) as total_sales from public.sales_clean group by 1 order by 1;'''
df = wr.db.read_sql_query(daily_sales, con=engine)
result = df.set_index('invoicedate')
result = result.resample('1D').sum()

traindata = np.trim_zeros(result.iloc[:,0], trim='f')
train = traindata[50:330]
test = traindata[330:]

result.plot(figsize=(15,8))
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9bb003c4a8>
In [23]:
from pylab import rcParams
import statsmodels.api as sm

rcParams['figure.figsize'] = 15, 8
decomposition = sm.tsa.seasonal_decompose(result, model='additive')
fig = decomposition.plot()
plt.show()
In [24]:
import pmdarima as pm
from statsmodels.tsa.arima_model import ARIMA
from random import random

smodel = pm.auto_arima(train, start_p=1, start_q=1,
                         test='adf',
                         max_p=3, max_q=3, m=12,
                         start_P=0, seasonal=True,
                         d=None, D=1, trace=False,
                         error_action='ignore',  
                         suppress_warnings=True, 
                         stepwise=True)

smodel.summary()
Out[24]:
SARIMAX Results
Dep. Variable: y No. Observations: 280
Model: SARIMAX(0, 0, 2)x(0, 1, 2, 12) Log Likelihood -2991.867
Date: Tue, 30 Jun 2020 AIC 5995.734
Time: 20:33:04 BIC 6017.280
Sample: 0 HQIC 6004.388
- 280
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 1930.6793 435.814 4.430 0.000 1076.500 2784.859
ma.L1 0.3839 0.069 5.555 0.000 0.248 0.519
ma.L2 0.1002 0.067 1.486 0.137 -0.032 0.232
ma.S.L12 -1.0623 0.075 -14.131 0.000 -1.210 -0.915
ma.S.L24 0.2558 0.079 3.228 0.001 0.100 0.411
sigma2 3.124e+08 0.002 1.81e+11 0.000 3.12e+08 3.12e+08
Ljung-Box (Q): 405.68 Jarque-Bera (JB): 25.27
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 2.37 Skew: 0.26
Prob(H) (two-sided): 0.00 Kurtosis: 4.41


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.21e+27. Standard errors may be unstable.
In [28]:
import matplotlib.pyplot as plt

n_periods = 30
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
index_of_fc = pd.date_range(train.index[-1], periods = n_periods, freq='D')

fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)

# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train[:], label='training')
plt.plot(test[:30], label='actual')
plt.plot(fitted_series, color='darkgreen', label='forecast')
plt.fill_between(lower_series.index, 
                 lower_series, 
                 upper_series, 
                 color='k', alpha=.15)

plt.legend(loc='upper left', fontsize=8)
plt.title("Forecast of sales vs actual")
plt.show()
In [38]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

prodcode = '22633'
daily_prod_sales = '''select date_trunc('day', invoicedate) as invoicedate, sum(quantity * unitprice) as total_sales from public.sales_clean where stockcode = '%s' group by 1 order by 1;'''
df = wr.db.read_sql_query(daily_prod_sales % prodcode, con=engine)
df_prod = df.set_index('invoicedate')
df_prod = df_prod.resample('1D').sum()

forecast_period = 30
prod_data = df_prod['total_sales']
train_prod = prod_data[:]
forecast_index = pd.date_range(start=df_prod.index[df_prod.size-1], freq='1D', periods=forecast_period)

my_order = (0,0,2)
my_seasonal_order = (0, 1, 2, 12)
model = SARIMAX(train_prod, order=my_order, seasonal_order=my_seasonal_order)

# fit model and run forecast
model_fit = model.fit()
fc = model_fit.forecast(forecast_period, alpha=0.05)
fc_series = pd.Series(fc, index=forecast_index)

fig = plt.figure(figsize=(12,5), dpi=100)
plt.plot(train_prod[df_prod.size-45:], label='history')
plt.plot(fc_series[:], label='forecast')
plt.title('%d days Forecast for product %s' % (forecast_period, prodcode))
plt.legend(loc='upper left', fontsize=8)
Out[38]:
<matplotlib.legend.Legend at 0x7f9b7f200400>
In [ ]: