solarhome.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
""" Module to read Ausgrid's "Solar Home Electricity" dataset
Pierre Haessig — May 2017
"""
import os.path

import numpy as np
import pandas as pd
from pandas.tseries.offsets import Day

csv_fpath = {
    '2010-2011': os.path.join('data', 'Solar home 2010-2011.csv'),
    '2011-2012': os.path.join('data', 'Solar home 2011-2012.csv'),
    '2012-2013': os.path.join('data', 'Solar home 2012-2013.csv'),
}

cache_fpath = {
    '2010-2011': os.path.join('data', 'Solar home 2010-2011.pickle.gz'),
    '2011-2012': os.path.join('data', 'Solar home 2011-2012.pickle.gz'),
    '2012-2013': os.path.join('data', 'Solar home 2012-2013.pickle.gz'),
}


def read_csv(year, use_cache=True):
    """Read one of the original CSV files of the Solar Home Electricity dataset.
    Returns a raw DataFrame, i.e. with the same shape as in the CSV file.
    It can be further processed by `reshape`.
    Notice: due to the non-ISO timestamp format and the dataset reshaping,
    it takes about 1 min to read the file. Therefore an automatic caching
    mechanism (gzipped pickled DataFrame) is provided to accelerate
    subsequent calls.
    """
    fpath = csv_fpath[year]
    cpath = cache_fpath[year]
    if use_cache:  # read cached DataFrame
        try:
            df_raw = pd.read_pickle(cpath, compression='gzip')
            return df_raw
        except Exception:
            print('Unable to load cached file! Falling back to parsing the CSV.')
    # else parse the original CSV file
    df_raw = pd.read_csv(fpath, skiprows=1,
                         parse_dates=['date'], dayfirst=True,
                         na_filter=False, dtype={'Row Quality': str})
    if use_cache:  # save DataFrame for subsequent calls
        df_raw.to_pickle(cpath, compression='gzip')
    return df_raw
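
# Typical call (a minimal sketch; the year key must be one of `csv_fpath`'s keys):
#     df_raw = read_csv('2011-2012')                   # cached after the first parse
#     df_raw = read_csv('2011-2012', use_cache=False)  # force re-parsing of the CSV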


def reshape(df_raw):
    """Reshape the raw DataFrame into a nicer "timeseries-friendly" format:
    * columns are customer/channel pairs (a `pandas.MultiIndex`)
    * rows are the records for each datetime (regular sampling every 30 minutes)
    Returns the reshaped DataFrame and a `missing_records` DataFrame listing
    the customer/channel pairs whose record length does not match the time index.
    """
    # Rows: clean periodic time index (one point every 30 min)
    d0, d1 = df_raw.date.min(), df_raw.date.max()
    index = pd.date_range(d0, d1 + Day(1), freq='30T', closed='left')
    # Columns: one (Customer, Channel) pair per column.
    # Channels: GC = general consumption, GG = gross (PV) generation, CL = controlled load
    customers = sorted(df_raw.Customer.unique())
    channels = ['GC', 'GG', 'CL']
    # Empty DataFrame with the proper MultiIndex column structure;
    # columns are appended one by one when filling below.
    empty_cols = pd.MultiIndex(
        levels=[customers, channels],
        labels=[[], []],  # `labels` is named `codes` in recent pandas versions
        names=['Customer', 'Channel'])
    df = pd.DataFrame(index=index, columns=empty_cols)
    # Fill the DataFrame:
    missing_records = []
    for c in customers:
        d_c = df_raw[df_raw.Customer == c]
        # TODO: save the row quality
        for ch in channels:
            d_c_ch = d_c[d_c['Consumption Category'] == ch]
            # half-hourly readings: skip the 5 leading metadata columns
            # and the trailing 'Row Quality' column
            ts = d_c_ch.iloc[:, 5:-1].values.ravel()
            if len(ts) != len(index):
                # TODO: account for incomplete records.
                # Especially in 2010-2011: len(ts) is very often 17155 ≈ 48*357.4 !!
                missing_records.append((c, ch, len(ts)))
            else:
                df[c, ch] = ts
    missing_records = pd.DataFrame(missing_records,
                                   columns=['Customer', 'Channel', 'data_len'])
    # unit conversion: energy over 30 min (kWh) → average power (kW)
    df *= 2
    return df, missing_records


def pv_capacity(df_raw):
    """PV generator capacity of each customer"""
    gen_cap_gby = df_raw.groupby('Customer')['Generator Capacity']
    # each customer is expected to have a single, constant generator capacity
    assert np.all(gen_cap_gby.nunique() == 1)
    return gen_cap_gby.mean()


def postcode(df_raw):
    """Postcode of each customer"""
    postcode_gby = df_raw.groupby('Customer')['Postcode']
    return postcode_gby.min()
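

if __name__ == '__main__':
    # Minimal usage sketch: assumes the Ausgrid CSV files listed in `csv_fpath`
    # are available under data/ (the first run parses the CSV in ~1 min,
    # later runs read the gzipped pickle cache).
    df_raw = read_csv('2012-2013')
    df, missing = reshape(df_raw)
    # columns of `df` are (Customer, Channel) pairs; pick the first available one
    cust, chan = df.columns[0]
    print(df[cust, chan].head())                        # half-hourly power in kW
    print(df.xs('GG', axis=1, level='Channel').head())  # PV generation of all customers
    # per-customer metadata
    print(pv_capacity(df_raw).head())
    print(postcode(df_raw).head())
    print(len(missing), 'incomplete customer/channel records')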