This dataset comes from the UCI Machine Learning Repository. Please read the dataset description first to understand what the data contains.
The goal with this dataset is first to perform clustering on the UNLABELLED data, and then to learn a classifier on the LABELLED data. Finally, an analysis should be conducted to check whether the clustering could be used for classification (i.e. do the clusters map properly onto the classes, possibly with more than one cluster per class?).
Each example is a vector of dimension 561. There are 7352 training examples and 2947 testing examples.
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sklearn
# Import the HAR dataset

# Mapping table from integer class id to activity name
labels = {1:'WALKING', 2:'WALKING UPSTAIRS', 3:'WALKING DOWNSTAIRS',
          4:'SITTING', 5:'STANDING', 6:'LAYING'}


def _read_feature_rows(path):
    """Return the whitespace-separated floats in *path* as a 2-D array.

    Each non-blank line of the file becomes one row. The file is opened in
    a ``with`` block so the handle is closed as soon as parsing finishes,
    even if a malformed value raises ``ValueError``.
    """
    with open(path, 'r') as handle:
        # Blank lines (e.g. a trailing newline at end of file) would produce
        # an empty row and a ragged array, so they are skipped.
        rows = [[float(ts) for ts in line.split()]
                for line in handle if line.strip()]
    # Convert to numpy for efficiency
    return np.array(rows)


def _read_class_ids(path):
    """Return the integer class ids in *path* (one per line) as a 1-D array.

    Blank lines are skipped; any other non-integer line raises
    ``ValueError``.
    """
    with open(path, 'r') as handle:
        ids = [int(line.rstrip('\n')) for line in handle if line.strip()]
    # Convert to numpy for efficiency
    return np.array(ids)


x_train = _read_feature_rows('train/X_train.txt')
y_train = _read_class_ids('train/y_train.txt')
x_test = _read_feature_rows('test/X_test.txt')
y_test = _read_class_ids('test/y_test.txt')
# Binary classifiers and neural networks also need the class labels encoded as one-hot vectors
from sklearn.preprocessing import LabelBinarizer
from subprocess import check_output
# Fit the binarizer on the training labels only, then reuse that same fitted
# object for the test labels so both encodings share one column layout.
binarizer = LabelBinarizer()
y_train_onehot = binarizer.fit_transform(y_train)
y_test_onehot = binarizer.transform(y_test)

# Print the shape of the training and testing feature arrays, one per line.
print(x_train.shape, x_test.shape, sep='\n')