Making some fixes

Chris Lai
2022-06-07 08:50:03 +01:00
parent 021ba2f291
commit f95c6c2527
10 changed files with 120 additions and 72 deletions

Dockerfile

@@ -1,7 +1,6 @@
 FROM python:3.10-slim-bullseye

 COPY thesillyhome_src /thesillyhome_src
-# COPY thesillyhome /thesillyhome

 RUN apt-get update && apt-get install -y curl bash
@@ -14,7 +13,6 @@ RUN \
     nodejs

 RUN pip3 install -U setuptools && \
-    # pip3 install -r /thesillyhome/requirements.txt &&\
     pip3 install -e /thesillyhome_src/thesillyhome/ && \
     pip3 install appdaemon==4.2.1

docker-compose.yml

@@ -5,7 +5,7 @@ volumes:
 services:
   thesillyhome:
     container_name: thesillyhome
-    image: lcmchris1/thesillyhome-container:latest
+    image: lcmchris1/thesillyhome-container:0.0.1
     restart: unless-stopped
     volumes:
       - thesillyhome_config:/thesillyhome_src/data/config/

appdaemon.yaml

@@ -6,5 +6,5 @@ appdaemon:
   plugins:
     HASS:
       type: hass
-      ha_url: <ha_url>
-      token: <ha_token>
+      ha_url: http://192.168.1.100:8123
+      token: eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJmODRjZTM2NmQzNTE0Y2UwOTYzMjFjZmZmZGRkZDk2ZSIsImlhdCI6MTY1NDQ5NTIyNSwiZXhwIjoxOTY5ODU1MjI1fQ.wBLKK5xqzUgcHgIEhhZcYsExHKIRB0gZErbDh6RkJhI

model_executor.py

@@ -6,7 +6,8 @@ from pandas import DataFrame
 from sklearn.tree import DecisionTreeClassifier
 from datetime import datetime
 import copy
-import os
+import os.path
+import logging

 class ModelExecutor(hass.Hass):
@@ -18,36 +19,44 @@ class ModelExecutor(hass.Hass):
         self.log("TheSillyHome has now started!")

     def load_models(self):
         '''
         Loads all models to a dictionary
         '''
         actuators = tsh_config.actuators
         act_model_set = {}
         for act in actuators:
-            if os.isFile(f"/data/model/{self.model_name_version}/{act}.pickle"):
+            if os.path.isfile(f"/thesillyhome_src/data/model/{self.model_name_version}/{act}.pickle"):
                 with open(
-                    f"/data/model/{self.model_name_version}/{act}.pickle", "rb"
+                    f"/thesillyhome_src/data/model/{self.model_name_version}/{act}.pickle", "rb"
                 ) as pickle_file:
                     content = pickle.load(pickle_file)
                     act_model_set[act] = content
             else:
-                print(f"No model for {act}")
-                act_model_set[act] = None
+                logging.info(f"No model for {act}")
         return act_model_set

     def state_handler(self, entity, attribute, old, new, kwargs):
         sensors = tsh_config.sensors
         float_sensors = tsh_config.float_sensors
+        self.log(f'Received state {entity}')

         if entity in sensors:
-            self.log(f"{entity} is {new}")
+            self.log(f"<----- {entity} is {new} ----->")

             # Get feature list from parsed data header, set all columns to 0
-            feature_list = pd.read_csv("/data/act_states.csv").columns
-            feature_list = feature_list.drop(["entity_id", "state"])
-            feature_list = pd.DataFrame(columns=feature_list)
-            feature_list = feature_list.append(pd.Series(), ignore_index=True)
-            feature_list.iloc[0] = 0
+            feature_list = pd.read_pickle(
+                "/thesillyhome_src/data/parsed/act_states.pkl").columns
+            feature_list = sorted(
+                list(set(feature_list) -
+                     set(['entity_id', 'state', "duplicate"]))
+            )
+            current_state_base = pd.DataFrame(columns=feature_list)
+            current_state_base.loc[len(current_state_base)] = 0
+            print(current_state_base)

-            # Get state of all sensors for model input
-            df_sen_states = copy.deepcopy(feature_list)
+            # Get current state of all sensors for model input
+            df_sen_states = copy.deepcopy(current_state_base)
             for sensor in sensors:
                 true_state = self.get_state(entity_id=sensor)
                 if sensor not in float_sensors:
@@ -59,10 +68,18 @@ class ModelExecutor(hass.Hass):
             # Execute all models for sensor and set states
             for act, model in self.act_model_set.items():
-                prediction = model.predict(df_sen_states)[0].split("::")[1]
-                if prediction == "on":
+                # The actuator's own feature state should not affect the model, nor should the duplicate column
+                cur_act_list = []
+                for feature in feature_list:
+                    if feature.startswith(act):
+                        cur_act_list.append(feature)
+                new_feature_list = sorted(list(set(feature_list) - set(cur_act_list)))
+                df_sen_states_less = df_sen_states[new_feature_list]
+
+                prediction = model.predict(df_sen_states_less)
+                if prediction == 1:
                     self.log(f"Turn on {act}")
                     self.turn_on(act)
-                elif prediction == "off":
+                elif prediction == 0:
                     self.log(f"Turn off {act}")
                     self.turn_off(act)
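For reference, the masking step above can be exercised outside AppDaemon; a minimal sketch with made-up entity names (light.kitchen and sensor.hall_motion are illustrative, not from this repo), with the unpickled per-actuator model stubbed out:

import pandas as pd

# Hypothetical one-hot feature columns; the real list comes from act_states.pkl.
feature_list = sorted([
    "light.kitchen_on", "light.kitchen_off",
    "sensor.hall_motion_on", "sensor.hall_motion_off",
])
act = "light.kitchen"

# Drop the actuator's own one-hot columns so the model never sees its own state.
cur_act_list = [f for f in feature_list if f.startswith(act)]
new_feature_list = sorted(set(feature_list) - set(cur_act_list))

# One row holding the current state of every feature, defaulted to 0.
df_sen_states = pd.DataFrame([[0] * len(feature_list)], columns=feature_list)
df_sen_states.loc[0, "sensor.hall_motion_on"] = 1
df_sen_states_less = df_sen_states[new_feature_list]

# With `model` being the unpickled DecisionTreeClassifier for this actuator:
# prediction = model.predict(df_sen_states_less)   # e.g. array([1]) -> turn_on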

thesillyhome/model_creator/home.py

@@ -2,6 +2,8 @@
 from datetime import datetime
 import mysql.connector
 import pandas as pd
+import os.path
+import logging

 # Local application imports
 import thesillyhome.model_creator.read_config_json as tsh_config
@@ -20,7 +22,10 @@ class homedb:
         self.password = tsh_config.db_password
         self.database = tsh_config.db_database

-    def get_data(self):
+    def get_data(self, from_cache=False):
+        if from_cache and os.path.exists(f"{tsh_config.data_dir}/parsed/all_states.pkl"):
+            logging.info("Using cached all_states.pkl")
+            return pd.read_pickle(f"{tsh_config.data_dir}/parsed/all_states.pkl")
         mydb = mysql.connector.connect(
             host=self.host,
             port=self.port,
@@ -36,9 +41,8 @@
         mycursor.execute(query)
         myresult = mycursor.fetchall()
         df = pd.DataFrame.from_dict(myresult)
-        df.to_csv(f"{tsh_config.data_dir}/parsed/all_states.csv")
-        return df
-
-    def store_data(self, table: str):
-        today = datetime.today().strftime("%Y_%m_%d")
-        self.get_data(table).to_csv(f"{today}_{table}.csv")
+        df.to_csv(f"{tsh_config.data_dir}/parsed/all_states.csv")
+        df.to_pickle(f"{tsh_config.data_dir}/parsed/all_states.pkl")
+        return df
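The new from_cache path is a plain read-through cache: hit MySQL once, pickle the frame, and reuse it on later runs. A standalone sketch of the same pattern, with the cache path and the database fetch stubbed out as placeholders:

import logging
import os.path
import pandas as pd

CACHE = "/tmp/all_states.pkl"  # stand-in for f"{tsh_config.data_dir}/parsed/all_states.pkl"

def fetch_from_db() -> pd.DataFrame:
    # Placeholder for the mysql.connector query above.
    return pd.DataFrame({"entity_id": ["light.kitchen"], "state": ["on"]})

def get_data(from_cache: bool = False) -> pd.DataFrame:
    # Reuse the pickled result of the last pull when asked to.
    if from_cache and os.path.exists(CACHE):
        logging.info("Using cached all_states.pkl")
        return pd.read_pickle(CACHE)
    df = fetch_from_db()
    df.to_pickle(CACHE)  # refresh the cache for the next run
    return df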

thesillyhome/model_creator/learning_model.py

@@ -7,6 +7,7 @@ from sklearn.tree import DecisionTreeClassifier, export_graphviz
 import numpy as np
 from sklearn.metrics import accuracy_score
 import pickle
+import logging

 # Local application imports
 import thesillyhome.model_creator.read_config_json as tsh_config
@@ -33,36 +34,43 @@ def visualize_tree(tree, actuators, feature_names, model_name_version):
     ]
     try:
         subprocess.check_call(command)
-        os.remove(f"{tsh_config.data_dir}/model/{model_name_version}/{actuator}.dot")
+        os.remove(
+            f"{tsh_config.data_dir}/model/{model_name_version}/{actuator}.dot")
     except:
         exit("Could not run dot, ie graphviz, to produce visualization")

-def train_model(actuators: list, model_name_version):
+def train_model(model_name_version):
     '''
     Train models for each actuator
     '''
+    actuators = tsh_config.actuators
+    df_act_states = pd.read_pickle(
+        f"{tsh_config.data_dir}/parsed/act_states.pkl"
+    )
+    df_act_states = df_act_states.reset_index(drop=True)

-    # Generate feature and output vectors from act states.
-    df_act_states = pd.read_csv(
-        f"{tsh_config.data_dir}/parsed/act_states.csv", index_col=False
-    ).drop(columns=["index"])
-    df_act_states = df_act_states.replace([np.inf, -np.inf], np.nan)
-    df_act_states = df_act_states.fillna(999)

     output_list = tsh_config.output_list.copy()
     act_list = list(set(df_act_states.columns) - set(output_list))

     for actuator in actuators:
-        print(f"Training model for {actuator}")
+        logging.info(f"Training model for {actuator}")
         df_act = df_act_states[df_act_states["entity_id"] == actuator]
         df_act = df_act.reset_index(drop=True)
         if df_act.empty:
-            print(f"No cases found for {actuator}")
+            logging.info(f"No cases found for {actuator}")
             continue

-        output_vector = np.where(df_act["state"] == "on", 1, 0)
-        # the actuators state should not affect the model
+        '''
+        Setting output and feature vector
+        '''
+        output_vector = df_act["state"]

+        # The actuator's own feature state should not affect the model, nor should the duplicate column
         cur_act_list = []
         for feature in act_list:
             if feature.startswith(actuator):
@@ -78,20 +86,19 @@ def train_model(actuators: list, model_name_version):
         )

         # # Weighting more recent observations more. 3 times if in top 50 percent
-        # sample_weight = np.ones(len(X_train))
-        # sample_weight[: int(len(sample_weight) * 0.2)] = 3
+        sample_weight = np.ones(len(X_train))
+        sample_weight[: int(len(sample_weight) * 0.2)] = 3

         # Weighting duplicates less
-        sample_weight = X_train["duplicate"]
-        X_train = X_train.drop(columns="duplicate")
-        X_test = X_test.drop(columns="duplicate")
+        sample_weight = sample_weight * X_train['duplicate']
+        X_train = X_train.drop(columns='duplicate')
+        X_test = X_test.drop(columns='duplicate')
+        y_train = y_train.drop(columns='duplicate')
+        y_test = y_test.drop(columns='duplicate')

         model_tree = DecisionTreeClassifier(random_state=99)
         model_tree.fit(X_train, y_train, sample_weight=sample_weight)

-        feature_list.remove("duplicate")
-
         # Visualization of trees:
         # tree_to_code(model_tree, feature_list)
         # visualize_tree(model_tree, feature_list)
@@ -100,7 +107,7 @@ def train_model(actuators: list, model_name_version):
         y_tree_predictions = model_tree.predict(X_test)

         # Extract predictions for each output variable and calculate accuracy and f1 score
-        print(
+        logging.info(
             f"{actuator} accuracy score: {accuracy_score(y_test, y_tree_predictions) * 100}"
         )
@@ -110,4 +117,4 @@ def train_model(actuators: list, model_name_version):
         filename = open(f"{model_directory}/{actuator}.pickle", "wb")
         pickle.dump(model_tree, filename)

-    print("Completed!")
+    logging.info("Completed!")

thesillyhome/model_creator/main.py

@@ -3,12 +3,12 @@
 import thesillyhome.model_creator.read_config_json as tsh_config
 from thesillyhome.model_creator.parse_data import parse_data_from_db
 from thesillyhome.model_creator.learning_model import train_model
+import logging

 if __name__ == "__main__":
-    actuators = tsh_config.actuators
-    sensors = tsh_config.sensors
+    logging.getLogger().setLevel(logging.INFO)
     model_name_version = tsh_config.model_name_version
     tsh_config.replace_yaml()
-    parse_data_from_db(actuators, sensors)
-    train_model(actuators, model_name_version)
+    parse_data_from_db()
+    train_model(model_name_version)

thesillyhome/model_creator/parse_data.py

@@ -8,6 +8,7 @@ import tqdm
 from tqdm import tqdm
 import numpy as np
 from multiprocessing import cpu_count
+import logging

 # Local application imports
 from thesillyhome.model_creator.home import homedb
@@ -28,16 +29,24 @@ def parallelize_dataframe(df1, df2, devices, func):
 def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices, pbar):
     '''
     Convert dataframe to:
-    act_state, last_state, sen_state1, sen_state2...
+    1) add last_state for entity_id
+    2) add duplicate state
+    3) add latest states of all sensors
     '''
     for index, row in df_output.iterrows():
-        # Add last_state non-float onto dataframe as trigger
+        # get last states (for non-float for now)
         last_device_state = df_states[
             (df_states["entity_id"].isin(devices))
             & (df_states["entity_id"] != row["entity_id"])
             & (df_states["last_changed"] < row["last_changed"])
+            & ~(df_states["entity_id"].isin(tsh_config.float_sensors))
         ]
+
         if not last_device_state.empty:
             df_output.loc[
                 index, "last_state"
@@ -45,12 +54,12 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
         else:
             df_output.loc[index, "last_state"] = np.NaN

-        # Value actual state changes more!
         last_current_device_state = df_states[
             (df_states["entity_id"] == row["entity_id"])
             & (df_states["last_changed"] < row["last_changed"])
         ]
+        # Value actual state changes more!
         if not last_current_device_state.empty:
             if last_current_device_state["state"].iloc[0] == row["state"]:
                 df_output.loc[index, "duplicate"] = 1
@@ -67,7 +76,8 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
         ]
         if not previous_device_state.empty:
-            df_output.loc[index, device] = previous_device_state["state"].iloc[0]
+            df_output.loc[index,
+                          device] = previous_device_state["state"].iloc[0]
         else:
             if device in tsh_config.float_sensors:
                 df_output.loc[index, device] = 0
@@ -77,7 +87,6 @@ def add_device_states(df_output: pd.DataFrame, df_states: pd.DataFrame, devices,
     return df_output

-
 # Hot encoding for all features
 def one_hot_encoder(df: DataFrame, column: str) -> DataFrame:
     one_hot = pd.get_dummies(df[column], prefix=column)
     df = df.drop(column, axis=1)
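one_hot_encoder is a thin wrapper over pd.get_dummies: the lines shown drop the source column, and presumably the dummies are joined back before returning. A quick usage example with an illustrative column name:

import pandas as pd

df = pd.DataFrame({"light.kitchen": ["on", "off", "on"]})
one_hot = pd.get_dummies(df["light.kitchen"], prefix="light.kitchen")
# Yields indicator columns light.kitchen_off / light.kitchen_on
# (0/1 ints on older pandas, booleans from pandas 2.0 onward).
df = df.drop("light.kitchen", axis=1).join(one_hot)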
@@ -107,28 +116,32 @@ def convert_unavailabe(df: DataFrame) -> DataFrame:
     return df

-def parse_data_from_db(actuators: list, sensors: list):
+def parse_data_from_db():
     """
     Our database currently stores by events.
     To create a valid ML classification case, we will parse all last
     sensor states for each actuator event and append it to the dataframe.
     """
+    actuators = tsh_config.actuators
+    sensors = tsh_config.sensors

-    print("Reading from homedb...")
+    logging.info("Reading from homedb...")
     df_all = homedb().get_data()
     df_all = convert_unavailabe(df_all)

-    assert ~df_all["state"].isnull().values.any(), df_all[df_all["state"].isnull()]
+    assert ~df_all["state"].isnull().values.any(
+    ), df_all[df_all["state"].isnull()]

-    print("Add previous state...")
+    logging.info("Add previous state...")
     devices = actuators + sensors
     df_states = df_all[df_all["entity_id"].isin(devices)]
     df_act_states = df_all[df_all["entity_id"].isin(actuators)]
     df_output = copy.deepcopy(df_act_states)

-    print("Start parallelization processing...")
-    df_output = parallelize_dataframe(df_output, df_states, devices, add_device_states)
+    logging.info("Start parallelization processing...")
+    df_output = parallelize_dataframe(
+        df_output, df_states, devices, add_device_states)

     """
     Code to add one hot encoding for date time.
@@ -140,24 +153,31 @@ def parse_data_from_db(actuators: list, sensors: list):
     )
     df_output = df_output.drop(columns=["last_changed"])

+    '''
+    Feature list extraction
+    '''
     output_list = tsh_config.output_list.copy()
     output_list.append("duplicate")
     feature_list = sorted(list(set(df_output.columns) - set(output_list)))

+    '''
+    Hot encoding for all columns bar float_sensors, which are already in int format
+    '''
     float_sensors = tsh_config.float_sensors
     for feature in feature_list:
-        # For float sensors, these are already in Int format so no encoding.
         if feature not in float_sensors:
             df_output = one_hot_encoder(df_output, feature)

-    # Remove some empty entity_id rows
-    df_output = df_output[df_output["entity_id"] != ""]
-
-    assert ~df_output.isnull().values.any(), df_output[
-        df_output["sensor.corridor_entrance_sensor_illuminance_lux"].isnull()
-    ]
+    '''
+    Output and checks
+    '''
+    df_output["state"] = np.where(df_output["state"] == "on", 1, 0)
+    assert df_output[df_output["entity_id"] == ""].empty
+    assert ~df_output.isnull().values.any()
+    assert ~df_output.isin([np.inf, -np.inf, np.nan]).values.any()

     df_output.to_csv(
         f"{tsh_config.data_dir}/parsed/act_states.csv", index=True, index_label="index"
     )
+    df_output.to_pickle(f"{tsh_config.data_dir}/parsed/act_states.pkl")
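parallelize_dataframe's body is outside this diff; a common shape for such a helper, and only an assumption here, is to split the actuator-event frame into one chunk per core, run add_device_states on each chunk in a process pool, and concatenate the results (progress-bar wiring omitted):

from functools import partial
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd

def parallelize_dataframe(df1, df2, devices, func):
    # Assumed implementation: one chunk of df1 per core, stitched back together.
    n_cores = cpu_count()
    chunks = np.array_split(df1, n_cores)
    with Pool(n_cores) as pool:
        results = pool.map(
            partial(func, df_states=df2, devices=devices, pbar=None), chunks
        )
    return pd.concat(results)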

thesillyhome/model_creator/read_config_json.py

@@ -62,7 +62,9 @@ def extract_float_sensors(sensors: list):
 float_sensors = extract_float_sensors(sensors)

 output_list_og = ["entity_id", "state"]
 output_list = ["entity_id", "state", "last_changed"]
+output_list_dup = ["entity_id", "state", "last_changed", "duplicate"]

 def replace_yaml():