Making some fixes

Chris Lai
2022-06-07 08:50:03 +01:00
parent 021ba2f291
commit f95c6c2527
10 changed files with 120 additions and 72 deletions

Dockerfile

@@ -1,7 +1,6 @@
 FROM python:3.10-slim-bullseye

 COPY thesillyhome_src /thesillyhome_src
-# COPY thesillyhome /thesillyhome

 RUN apt-get update && apt-get install -y curl bash
@@ -14,7 +13,6 @@ RUN \
     nodejs

 RUN pip3 install -U setuptools && \
-    # pip3 install -r /thesillyhome/requirements.txt &&\
     pip3 install -e /thesillyhome_src/thesillyhome/ && \
     pip3 install appdaemon==4.2.1

docker-compose.yml

@@ -5,7 +5,7 @@ volumes:
 services:
   thesillyhome:
     container_name: thesillyhome
-    image: lcmchris1/thesillyhome-container:latest
+    image: lcmchris1/thesillyhome-container:0.0.1
     restart: unless-stopped
     volumes:
       - thesillyhome_config:/thesillyhome_src/data/config/

appdaemon.yaml

@@ -6,5 +6,5 @@ appdaemon:
   plugins:
     HASS:
       type: hass
-      ha_url: <ha_url>
-      token: <ha_token>
+      ha_url: http://192.168.1.100:8123
+      token: eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJmODRjZTM2NmQzNTE0Y2UwOTYzMjFjZmZmZGRkZDk2ZSIsImlhdCI6MTY1NDQ5NTIyNSwiZXhwIjoxOTY5ODU1MjI1fQ.wBLKK5xqzUgcHgIEhhZcYsExHKIRB0gZErbDh6RkJhI

model_executor.py

@@ -6,7 +6,8 @@ from pandas import DataFrame
 from sklearn.tree import DecisionTreeClassifier
 from datetime import datetime
 import copy
-import os
+import os.path
+import logging

 class ModelExecutor(hass.Hass):
@@ -18,36 +19,44 @@ class ModelExecutor(hass.Hass):
         self.log("TheSillyHome has now started!")

     def load_models(self):
         '''
         Loads all models to a dictionary
         '''
         actuators = tsh_config.actuators
         act_model_set = {}
         for act in actuators:
-            if os.isFile(f"/data/model/{self.model_name_version}/{act}.pickle"):
+            if os.path.isfile(f"/thesillyhome_src/data/model/{self.model_name_version}/{act}.pickle"):
                 with open(
-                    f"/data/model/{self.model_name_version}/{act}.pickle", "rb"
+                    f"/thesillyhome_src/data/model/{self.model_name_version}/{act}.pickle", "rb"
                 ) as pickle_file:
                     content = pickle.load(pickle_file)
                     act_model_set[act] = content
             else:
-                print(f"No model for {act}")
-                act_model_set[act] = None
+                logging.info(f"No model for {act}")
         return act_model_set

     def state_handler(self, entity, attribute, old, new, kwargs):
         sensors = tsh_config.sensors
         float_sensors = tsh_config.float_sensors
+        self.log(f'Received state {entity}')

         if entity in sensors:
-            self.log(f"{entity} is {new}")
+            self.log(f"<----- {entity} is {new} ----->")

             # Get feature list from parsed data header, set all columns to 0
-            feature_list = pd.read_csv("/data/act_states.csv").columns
-            feature_list = feature_list.drop(["entity_id", "state"])
-            feature_list = pd.DataFrame(columns=feature_list)
-            feature_list = feature_list.append(pd.Series(), ignore_index=True)
-            feature_list.iloc[0] = 0
+            feature_list = pd.read_pickle(
+                "/thesillyhome_src/data/parsed/act_states.pkl").columns
+            feature_list = sorted(
+                list(set(feature_list) -
+                     set(['entity_id', 'state', "duplicate"]))
+            )
+            current_state_base = pd.DataFrame(columns=feature_list)
+            current_state_base.loc[len(current_state_base)] = 0
+            print(current_state_base)

-            # Get state of all sensors for model input
-            df_sen_states = copy.deepcopy(feature_list)
+            # Get current state of all sensors for model input
+            df_sen_states = copy.deepcopy(current_state_base)
             for sensor in sensors:
                 true_state = self.get_state(entity_id=sensor)
                 if sensor not in float_sensors:
@@ -59,10 +68,18 @@ class ModelExecutor(hass.Hass):
             # Execute all models for sensor and set states
             for act, model in self.act_model_set.items():
-                prediction = model.predict(df_sen_states)[0].split("::")[1]
-                if prediction == "on":
+                # The actuator's own feature state should not affect the model, nor should the duplicate column
+                cur_act_list = []
+                for feature in feature_list:
+                    if feature.startswith(act):
+                        cur_act_list.append(feature)
+                new_feature_list = sorted(list(set(feature_list) - set(cur_act_list)))
+                df_sen_states_less = df_sen_states[new_feature_list]
+
+                prediction = model.predict(df_sen_states_less)
+                if prediction == 1:
                     self.log(f"Turn on {act}")
                     self.turn_on(act)
-                elif prediction == "off":
+                elif prediction == 0:
                     self.log(f"Turn off {act}")
                     self.turn_off(act)
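For reference, the masking step above can be exercised outside AppDaemon; a minimal sketch with made-up entity names (light.kitchen and sensor.hall_motion are illustrative, not from this repo), with the unpickled per-actuator model stubbed out:

import pandas as pd

# Hypothetical one-hot feature columns; the real list comes from act_states.pkl.
feature_list = sorted([
    "light.kitchen_on", "light.kitchen_off",
    "sensor.hall_motion_on", "sensor.hall_motion_off",
])
act = "light.kitchen"

# Drop the actuator's own one-hot columns so the model never sees its own state.
cur_act_list = [f for f in feature_list if f.startswith(act)]
new_feature_list = sorted(set(feature_list) - set(cur_act_list))

# One row holding the current state of every feature, defaulted to 0.
df_sen_states = pd.DataFrame([[0] * len(feature_list)], columns=feature_list)
df_sen_states.loc[0, "sensor.hall_motion_on"] = 1
df_sen_states_less = df_sen_states[new_feature_list]

# With `model` being the unpickled DecisionTreeClassifier for this actuator:
# prediction = model.predict(df_sen_states_less)   # e.g. array([1]) -> turn_on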

thesillyhome/model_creator/home.py

@@ -2,6 +2,8 @@
 from datetime import datetime
 import mysql.connector
 import pandas as pd
+import os.path
+import logging

 # Local application imports
 import thesillyhome.model_creator.read_config_json as tsh_config
@@ -20,7 +22,10 @@ class homedb:
         self.password = tsh_config.db_password
         self.database = tsh_config.db_database

-    def get_data(self):
+    def get_data(self, from_cache=False):
+        if from_cache and os.path.exists(f"{tsh_config.data_dir}/parsed/all_states.pkl"):
+            logging.info("Using cached all_states.pkl")
+            return pd.read_pickle(f"{tsh_config.data_dir}/parsed/all_states.pkl")
         mydb = mysql.connector.connect(
             host=self.host,
             port=self.port,
@@ -36,9 +41,8 @@
         mycursor.execute(query)
         myresult = mycursor.fetchall()
         df = pd.DataFrame.from_dict(myresult)
-        df.to_csv(f"{tsh_config.data_dir}/parsed/all_states.csv")
-        return df
-
-    def store_data(self, table: str):
-        today = datetime.today().strftime("%Y_%m_%d")
-        self.get_data(table).to_csv(f"{today}_{table}.csv")
+        df.to_csv(f"{tsh_config.data_dir}/parsed/all_states.csv")
+        df.to_pickle(f"{tsh_config.data_dir}/parsed/all_states.pkl")
+        return df
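The new from_cache path is a plain read-through cache: hit MySQL once, pickle the frame, and reuse it on later runs. A standalone sketch of the same pattern, with the cache path and the database fetch stubbed out as placeholders:

import logging
import os.path
import pandas as pd

CACHE = "/tmp/all_states.pkl"  # stand-in for f"{tsh_config.data_dir}/parsed/all_states.pkl"

def fetch_from_db() -> pd.DataFrame:
    # Placeholder for the mysql.connector query above.
    return pd.DataFrame({"entity_id": ["light.kitchen"], "state": ["on"]})

def get_data(from_cache: bool = False) -> pd.DataFrame:
    # Reuse the pickled result of the last pull when asked to.
    if from_cache and os.path.exists(CACHE):
        logging.info("Using cached all_states.pkl")
        return pd.read_pickle(CACHE)
    df = fetch_from_db()
    df.to_pickle(CACHE)  # refresh the cache for the next run
    return df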

thesillyhome/model_creator/learning_model.py

@@ -7,6 +7,7 @@ from sklearn.tree import DecisionTreeClassifier, export_graphviz
 import numpy as np
 from sklearn.metrics import accuracy_score
 import pickle
+import logging

 # Local application imports
 import thesillyhome.model_creator.read_config_json as tsh_config
@@ -33,36 +34,43 @@ def visualize_tree(tree, actuators, feature_names, model_name_version):
     ]
     try:
         subprocess.check_call(command)
-        os.remove(f"{tsh_config.data_dir}/model/{model_name_version}/{actuator}.dot")
+        os.remove(
+            f"{tsh_config.data_dir}/model/{model_name_version}/{actuator}.dot")
     except:
         exit("Could not run dot, ie graphviz, to produce visualization")

-def train_model(actuators: list, model_name_version):
+def train_model(model_name_version):
     '''
     Train models for each actuator
     '''
+    actuators = tsh_config.actuators
+    df_act_states = pd.read_pickle(
+        f"{tsh_config.data_dir}/parsed/act_states.pkl"
+    )
+    df_act_states = df_act_states.reset_index(drop=True)

-    # Generate feature and output vectors from act states.
-    df_act_states = pd.read_csv(
-        f"{tsh_config.data_dir}/parsed/act_states.csv", index_col=False
-    ).drop(columns=["index"])
-    df_act_states = df_act_states.replace([np.inf, -np.inf], np.nan)
-    df_act_states = df_act_states.fillna(999)

     output_list = tsh_config.output_list.copy()
     act_list = list(set(df_act_states.columns) - set(output_list))

     for actuator in actuators:
-        print(f"Training model for {actuator}")
+        logging.info(f"Training model for {actuator}")
         df_act = df_act_states[df_act_states["entity_id"] == actuator]
         df_act = df_act.reset_index(drop=True)
         if df_act.empty:
-            print(f"No cases found for {actuator}")
+            logging.info(f"No cases found for {actuator}")
             continue

-        output_vector = np.where(df_act["state"] == "on", 1, 0)
-        # the actuators state should not affect the model
+        '''
+        Setting output and feature vector
+        '''
+        output_vector = df_act["state"]

+        # The actuator's own feature state should not affect the model, nor should the duplicate column
         cur_act_list = []
         for feature in act_list:
             if feature.startswith(actuator):
@@ -78,20 +86,19 @@ def train_model(actuators: list, model_name_version):
         )

         # # Weighting more recent observations more. 3 times if in top 50 percent
-        # sample_weight = np.ones(len(X_train))
-        # sample_weight[: int(len(sample_weight) * 0.2)] = 3
+        sample_weight = np.ones(len(X_train))
+        sample_weight[: int(len(sample_weight) * 0.2)] = 3

         # Weighting duplicates less
-        sample_weight = X_train["duplicate"]
-        X_train = X_train.drop(columns="duplicate")
-        X_test = X_test.drop(columns="duplicate")
+        sample_weight = sample_weight * X_train['duplicate']
+        X_train = X_train.drop(columns='duplicate')
+        X_test = X_test.drop(columns='duplicate')
+        y_train = y_train.drop(columns='duplicate')
+        y_test = y_test.drop(columns='duplicate')

         model_tree = DecisionTreeClassifier(random_state=99)
         model_tree.fit(X_train, y_train, sample_weight=sample_weight)

-        feature_list.remove("duplicate")
-
         # Visualization of trees:
         # tree_to_code(model_tree, feature_list)
         # visualize_tree(model_tree, feature_list)
@@ -100,7 +107,7 @@ def train_model(actuators: list, model_name_version):
         y_tree_predictions = model_tree.predict(X_test)

         # Extract predictions for each output variable and calculate accuracy and f1 score
-        print(
+        logging.info(
             f"{actuator} accuracy score: {accuracy_score(y_test, y_tree_predictions) * 100}"
         )
@@ -110,4 +117,4 @@ def train_model(actuators: list, model_name_version):
         filename = open(f"{model_directory}/{actuator}.pickle", "wb")
         pickle.dump(model_tree, filename)

-    print("Completed!")
+    logging.info("Completed!")

thesillyhome/model_creator/main.py

@@ -3,12 +3,12 @@
 import thesillyhome.model_creator.read_config_json as tsh_config
 from thesillyhome.model_creator.parse_data import parse_data_from_db
 from thesillyhome.model_creator.learning_model import train_model
+import logging

 if __name__ == "__main__":
-    actuators = tsh_config.actuators
-    sensors = tsh_config.sensors
+    logging.getLogger().setLevel(logging.INFO)
     model_name_version = tsh_config.model_name_version
     tsh_config.replace_yaml()
-    parse_data_from_db(actuators, sensors)
-    train_model(actuators, model_name_version)
+    parse_data_from_db()
+    train_model(model_name_version)

thesillyhome/model_creator/parse_data.py

@@ -8,6 +8,7 @@ import tqdm
 from tqdm import tqdm
 import numpy as np
 from multiprocessing import cpu_count
+import logging

 # Local application imports
 from thesillyhome.model_creator.home import homedb
@@ -28,16 +29,24 @@ def parallelize_dataframe(df1, df2, devices, func):
 def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices, pbar):
     '''
     Convert dataframe to:
-    act_state, last_state, sen_state1, sen_state2...
+    1) add last_state for entity_id
+    2) add duplicate state
+    3) add latest states of all sensors
     '''
     for index, row in df_output.iterrows():
-        # Add last_state non-float onto dataframe as trigger
+        # get last states (for non-float for now)
         last_device_state = df_states[
             (df_states["entity_id"].isin(devices))
             & (df_states["entity_id"] != row["entity_id"])
             & (df_states["last_changed"] < row["last_changed"])
+            & ~(df_states["entity_id"].isin(tsh_config.float_sensors))
         ]
+
         if not last_device_state.empty:
             df_output.loc[
                 index, "last_state"
@@ -45,12 +54,12 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
         else:
             df_output.loc[index, "last_state"] = np.NaN

-        # Value actual state changes more!
         last_current_device_state = df_states[
             (df_states["entity_id"] == row["entity_id"])
             & (df_states["last_changed"] < row["last_changed"])
         ]
+        # Value actual state changes more!
         if not last_current_device_state.empty:
             if last_current_device_state["state"].iloc[0] == row["state"]:
                 df_output.loc[index, "duplicate"] = 1
@@ -67,7 +76,8 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
         ]
         if not previous_device_state.empty:
-            df_output.loc[index, device] = previous_device_state["state"].iloc[0]
+            df_output.loc[index,
+                          device] = previous_device_state["state"].iloc[0]
         else:
             if device in tsh_config.float_sensors:
                 df_output.loc[index, device] = 0
@@ -77,7 +87,6 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
     return df_output

-
 # Hot encoding for all features
 def one_hot_encoder(df: DataFrame, column: str) -> DataFrame:
     one_hot = pd.get_dummies(df[column], prefix=column)
     df = df.drop(column, axis=1)
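one_hot_encoder is a thin wrapper over pd.get_dummies: the lines shown drop the source column, and presumably the dummies are joined back before returning. A quick usage example with an illustrative column name:

import pandas as pd

df = pd.DataFrame({"light.kitchen": ["on", "off", "on"]})
one_hot = pd.get_dummies(df["light.kitchen"], prefix="light.kitchen")
# Yields indicator columns light.kitchen_off / light.kitchen_on
# (0/1 ints on older pandas, booleans from pandas 2.0 onward).
df = df.drop("light.kitchen", axis=1).join(one_hot)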
@@ -107,28 +116,32 @@ def convert_unavailabe(df: DataFrame) -> DataFrame:
     return df

-def parse_data_from_db(actuators: list, sensors: list):
+def parse_data_from_db():
     """
     Our database currently stores by events.
     To create a valid ML classification case, we will parse all last
     sensor states for each actuator event and append it to the dataframe.
     """
+    actuators = tsh_config.actuators
+    sensors = tsh_config.sensors

-    print("Reading from homedb...")
+    logging.info("Reading from homedb...")
     df_all = homedb().get_data()
     df_all = convert_unavailabe(df_all)

-    assert ~df_all["state"].isnull().values.any(), df_all[df_all["state"].isnull()]
+    assert ~df_all["state"].isnull().values.any(
+    ), df_all[df_all["state"].isnull()]

-    print("Add previous state...")
+    logging.info("Add previous state...")
     devices = actuators + sensors
     df_states = df_all[df_all["entity_id"].isin(devices)]
     df_act_states = df_all[df_all["entity_id"].isin(actuators)]
     df_output = copy.deepcopy(df_act_states)

-    print("Start parallelization processing...")
-    df_output = parallelize_dataframe(df_output, df_states, devices, add_device_states)
+    logging.info("Start parallelization processing...")
+    df_output = parallelize_dataframe(
+        df_output, df_states, devices, add_device_states)

     """
     Code to add one hot encoding for date time.
@@ -140,24 +153,31 @@ def parse_data_from_db(actuators: list, sensors: list):
     )
     df_output = df_output.drop(columns=["last_changed"])

+    '''
+    Feature list extraction
+    '''
     output_list = tsh_config.output_list.copy()
     output_list.append("duplicate")
     feature_list = sorted(list(set(df_output.columns) - set(output_list)))

+    '''
+    Hot encoding for all columns bar float_sensors, which are already in int format
+    '''
     float_sensors = tsh_config.float_sensors
     for feature in feature_list:
-        # For float sensors, these are already in Int format so no encoding.
         if feature not in float_sensors:
             df_output = one_hot_encoder(df_output, feature)

-    # Remove some empty entity_id rows
-    df_output = df_output[df_output["entity_id"] != ""]
-
-    assert ~df_output.isnull().values.any(), df_output[
-        df_output["sensor.corridor_entrance_sensor_illuminance_lux"].isnull()
-    ]
+    '''
+    Output and checks
+    '''
+    df_output["state"] = np.where(df_output["state"] == "on", 1, 0)
+    assert df_output[df_output["entity_id"] == ""].empty
+    assert ~df_output.isnull().values.any()
+    assert ~df_output.isin([np.inf, -np.inf, np.nan]).values.any()

     df_output.to_csv(
         f"{tsh_config.data_dir}/parsed/act_states.csv", index=True, index_label="index"
     )
+    df_output.to_pickle(f"{tsh_config.data_dir}/parsed/act_states.pkl")
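parallelize_dataframe's body is outside this diff; a common shape for such a helper, and only an assumption here, is to split the actuator-event frame into one chunk per core, run add_device_states on each chunk in a process pool, and concatenate the results (progress-bar wiring omitted):

from functools import partial
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd

def parallelize_dataframe(df1, df2, devices, func):
    # Assumed implementation: one chunk of df1 per core, stitched back together.
    n_cores = cpu_count()
    chunks = np.array_split(df1, n_cores)
    with Pool(n_cores) as pool:
        results = pool.map(
            partial(func, df_states=df2, devices=devices, pbar=None), chunks
        )
    return pd.concat(results)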

thesillyhome/model_creator/read_config_json.py

@@ -62,7 +62,9 @@ def extract_float_sensors(sensors: list):
 float_sensors = extract_float_sensors(sensors)

 output_list_og = ["entity_id", "state"]
 output_list = ["entity_id", "state", "last_changed"]
+output_list_dup = ["entity_id", "state", "last_changed", "duplicate"]

 def replace_yaml():