In this study we analyze three years of data, which include environmental parameters from nearby weather stations, information about the sites where traps are located, and the total summer counts of the pest fly spotted-wing drosophila (SWD) in each trap. We use machine learning models to test whether we can predict if a site is "high risk" (fly totals above the median) or "low risk" (fly totals at or below the median).
This dataset is likely too small to draw conclusive results, but it is a good case study.
!pip install silence_tensorflow
import numpy as np # for linear algebra
from sklearn import preprocessing # for machine learning
import pandas as pd # for dataframes
import silence_tensorflow.auto # to silence warning messages from tf
import tensorflow as tf # for ML models
import os # for file paths
# Mount Google Drive so the trap-count CSV and the model/log directories
# are reachable from this Colab notebook (prompts for authorization).
from google.colab import drive
drive.mount('/content/drive')
#Load dataframe with the SWD trap counts and weather/site variables
raw_csv_data = pd.read_csv("/content/drive/My Drive/SWD Count Data/SWD_traps_HR_Wasco_171819.csv")
# Bare expression: notebook displays the first rows for a quick sanity check
raw_csv_data.head()
# Create a copy of the raw dataset to work on, leaving raw_csv_data intact
df = raw_csv_data.copy()
# Inspect dataframe information (column dtypes, non-null counts)
df.info()
# Make 'year' categorical so it is treated as a label, not as a number
df["year"] = df["year"].astype("category", copy = False)
# Fill NAs with means from the same town and year, only for numerical
# variables. Any value still missing after that (e.g. a town/year group
# that is entirely NA) falls back to the overall column mean.
# FIX: the original had a bare `df.groupby(['town', 'year'])` statement
# whose result was discarded (a no-op), and it recomputed the groupby for
# every column inside the loop — build it once and reuse it.
group_means = df.groupby(['town', 'year'])
for column in df.columns:
    if df[column].dtype == np.float64:
        # Assign back instead of fillna(..., inplace=True) on a column
        # selection, which is deprecated chained assignment in pandas.
        df[column] = df[column].fillna(group_means[column].transform('mean'))
        df[column] = df[column].fillna(df[column].mean())
# Remove identifier/location columns and the spring/winter outcome columns
# that are not used as predictors. axis=1 targets columns, not rows.
unused_columns = [
    "trap_ID", "town", "northing_UTM", "easting_UTM",
    "weather_station_uspest", "first_catch_spring_date",
    "first_catch_spring_day", "total_SWD_spring", "total_SWD_winter",
    "SWD_June",
]
df = df.drop(unused_columns, axis=1)
# Put the site/setting variables first, the weather variables next, and
# the outcome variable last.
column_names_reordered = [
    'year', 'host', 'management', 'setting', 'lure', 'latitude', 'longitude',
    'elevation_m', 'Tmin_winter', 'Tmax_winter', 'Tmin_spring', 'Tmax_spring',
    'Tmin_summer', 'Tmax_summer', 'days_below_5_winter', 'days_below_0_winter',
    'DD_winter', 'DD_spring', 'DD_summer', 'precipitation_winter',
    'precipitation_spring', 'precipitation_summer', 'total_SWD_summer',
]
df = df[column_names_reordered]
Here we end up with 22 independent variables (describing the trap setting and the weather), and one outcome variable ("total_SWD_summer"), which is the total number of flies collected in a trap at each site over the summer.
# Create a checkpoint of the cleaned dataframe
df1 = df.copy()
# Encode the categorical variables as 0/1 dummies. Some are collapsed to
# only two levels to keep the dataset reasonably balanced.
binary_codings = {
    # host = "cherry" or "other"
    'host': {'cherry': 0, 'raspberry': 1, 'blackberry': 1, 'peach': 1, 'blueberry': 1},
    # management = "unmanaged" or "managed"
    'management': {'unmanaged': 0, 'managed': 1},
    # setting = "agricultural" or not
    'setting': {'agricultural': 0, 'urban': 1, 'forest': 1},
    # lure = "Trece+ACV" or "CACV"
    'lure': {'Trece+ACV': 0, 'CACV': 1},
}
for name, coding in binary_codings.items():
    df1[name] = df1[name].map(coding).astype("category", copy=False)
# Because the dataset is so small, instead of using counts as a continuous
# outcome variable, we divide the trap sites into two categories:
# 1 = "high risk" (above the median summer count), 0 = "low risk"
# (at or below the median).
swd_median = df1['total_SWD_summer'].median()
targets = np.where(df1['total_SWD_summer'] > swd_median, 1, 0)
# Remove the outcome variable, then standardize every numeric predictor
# (zero mean, unit variance); the categorical dummies are left untouched.
unscaled_inputs = df1.drop(['total_SWD_summer'], axis=1)
scaled_inputs = unscaled_inputs.copy()
numeric_dtypes = (np.float64, np.int64)
for name in scaled_inputs.columns:
    if scaled_inputs[name].dtype in numeric_dtypes:
        as_float = scaled_inputs[name].astype(np.float64)
        scaled_inputs[name] = preprocessing.scale(as_float)
The data is now fully preprocessed, so we can move on to building and training a machine learning model.
# All hyperparameters live in a single "config" dictionary so the model
# can be inspected and tuned in one place.
config = {
    # Architecture
    "input_size": len(scaled_inputs.columns),  # one input per predictor column
    "output_size": 1,                          # single unit for the binary 0/1 choice
    "hidden_layer_size": 64,
    "dense_layer_activation": "relu",
    "output_activation": "sigmoid",
    # Compilation
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "metrics": ["accuracy"],
    # Fitting
    "batch_size": 1,
    "epochs": 10,
    "validation_freq": 1,
    # Callbacks: step the learning rate down as training progresses.
    # Each rule applies for a proportion of the total epochs.
    "learning_rate_schedule": [
        {"proportion": 0.33, "learning_rate": 0.001},
        {"proportion": 0.33, "learning_rate": 0.0001},
        {"proportion": 0.34, "learning_rate": 0.00001},
    ],
    "tensorboard": True,
    # Where TensorBoard logs are written
    "work_dir": "/content/drive/My Drive/models/swd_trap_counts",
}
# Create callbacks for model.fit
callbacks = []
# FIX: the original tested `"learning_rate" in config`, but the config key
# is "learning_rate_schedule" — the scheduler callback was silently never
# attached (and iterating config["learning_rate"] would have raised).
if "learning_rate_schedule" in config:
    # Define learning rate schedule
    def learning_rate_schedule(epoch_index: int) -> float:
        """Return the learning rate to use for the given (0-based) epoch.

        Each rule covers int(proportion * epochs) epochs. Because of
        integer truncation the rules may not cover every epoch (e.g.
        3 + 3 + 3 = 9 rules-covered epochs for epochs=10), so any
        remaining epochs fall back to the last rule's learning rate
        instead of returning None, which would crash
        tf.keras.callbacks.LearningRateScheduler.
        """
        epoch_until = 0
        for rule in config["learning_rate_schedule"]:
            epoch_until += int(rule['proportion'] * config['epochs'])
            if epoch_index < epoch_until:
                return rule['learning_rate']
        return config["learning_rate_schedule"][-1]['learning_rate']
    learning_rate_callback = tf.keras.callbacks.LearningRateScheduler(
        learning_rate_schedule, verbose=1)
    callbacks.append(learning_rate_callback)
if "tensorboard" in config:
    # Writing logs to monitor training in Tensorboard
    checkpoint_tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(config["work_dir"], 'logs'),
    )
    callbacks.append(checkpoint_tensorboard)
def get_model():
    """Build and compile a fresh binary classifier from `config`.

    One ReLU hidden layer followed by a single sigmoid output unit.
    """
    # Fix the global TF random seed so every fold starts from the same
    # initial weights.
    tf.random.set_seed(1234)
    layers = [
        tf.keras.layers.Input(shape=config["input_size"]),
        tf.keras.layers.Dense(
            config["hidden_layer_size"],
            activation=config["dense_layer_activation"],
        ),
        tf.keras.layers.Dense(
            config["output_size"],
            activation=config["output_activation"],
        ),
    ]
    model = tf.keras.Sequential(layers)
    model.compile(
        optimizer=config["optimizer"],
        loss=config["loss"],
        metrics=config["metrics"],
    )
    return model
# Build the tf.data pipeline in one chain: wrap the preprocessed inputs
# and targets, shuffle the whole dataset, emit one sample per batch, and
# enumerate() so every batch carries an index 0, 1, 2, ..., N-1 that lets
# us route batches into cross-validation folds.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((scaled_inputs.values, targets))
    .shuffle(len(scaled_inputs))
    .batch(1)
    .enumerate()
)
# Because we have so little data, we perform k-fold cross-validation with
# k equal to the number of samples — i.e. leave-one-out: each round holds
# out a single sample and trains on all the rest.
num_folds = len(scaled_inputs)
# 3. Go through each fold
# 3. Go through each fold: train a fresh model on the other folds and
#    validate on the held-out fold.
for fold_index in range(num_folds):
    # Split the data into training and testing according to the fold_index.
    # NOTE(review): the lambdas capture fold_index from the loop variable;
    # this appears to rely on tf.data tracing each filter at call time so
    # every fold keeps its own value — confirm before restructuring.
    train_list_ds = dataset.filter(lambda i, data: i % num_folds != fold_index)
    test_list_ds = dataset.filter(lambda i, data: i % num_folds == fold_index)
    # Remove the extra "index" column added so that we can more easily split
    # data items into folds.
    train_list_ds = train_list_ds.map(lambda i, data: data)
    test_list_ds = test_list_ds.map(lambda i, data: data)
    # Get a "fresh" model instance (re-seeded, re-initialized weights)
    model = get_model()
    # Train the model, using the fold's test data as validation data.
    # validation_freq is set to the epoch count (not config["validation_freq"])
    # so validation runs only on the final epoch — each fold then writes
    # exactly one epoch_accuracy event to the TensorBoard validation logs,
    # which the averaging step later in the notebook depends on.
    model.fit(
        train_list_ds,
        batch_size = config["batch_size"],  # ignored: the dataset is pre-batched
        epochs = config["epochs"],
        callbacks = callbacks,
        validation_data = test_list_ds,
        validation_freq = config["epochs"],
        verbose = 0
    )
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Set up Tensorboard to plot accuracy and loss as the model trains
%tensorboard --logdir="/content/drive/My Drive/models/swd_trap_counts"
The graph shows the validation accuracy and loss of k models (one per fold), so it is a bit messy to read. Because each fold has only one validation sample, the per-fold score is either 1 (the model predicted the outcome correctly) or 0 (the model predicted the outcome incorrectly). To get a better estimate of the overall validation accuracy, we sum the validation accuracy scores from all the folds and divide by the total number of folds.
from tensorflow.python.summary.summary_iterator import summary_iterator
from pathlib import Path
# Walk every event file in the TensorBoard validation log directory,
# collect the per-fold epoch_accuracy values (one per fold, since each
# fold validates only on its final epoch), and average them into a single
# leave-one-out validation accuracy estimate.
validation_accuracy = []
log_dir = Path("/content/drive/My Drive/models/swd_trap_counts/logs/validation/")
for log_file_path in log_dir.glob("*"):
    for event in summary_iterator(str(log_file_path)):
        for value in event.summary.value:
            if value.tag == 'epoch_accuracy':
                validation_accuracy.append(value.simple_value)
np.asarray(validation_accuracy).sum()/len(validation_accuracy)
With this particular dataset, we see that the validation accuracy is only 0.59, not much better than chance. This means that, given the environmental variables, the model can correctly predict whether a site will be "high risk" or "low risk" only 59% of the time. Either the predictors we chose are poor predictors of which sites will have high or low counts of SWD, or the dataset is too small to build a robust model.