ML Gesture Recognition
Classify multi-electrode touch patterns using machine learning — from data collection on the Touch Board through to a trained model running in the browser.
Rule-based gesture classifiers break when users deviate from expected behaviour. Machine learning classifiers generalise: a model trained on a few hundred examples of a gesture can recognise that gesture performed faster, slower, by different people, with varying electrode contact. This tutorial builds a complete ML pipeline — data collection from the Touch Board, feature engineering, model training in Python, and deployment in the browser with ml5.js.
System Overview
Touch Board → Arduino firmware (proximity stream)
↓
Python data collector (records labelled examples to CSV)
↓
Jupyter notebook (feature engineering + model training)
↓
Export: model.json + weights.bin (TensorFlow.js format)
↓
p5.js + ml5.js (real-time inference in browser)
The full round-trip from no data to a working browser classifier takes about two hours, including data collection time.
Data Collection Firmware
The firmware streams all 12 proximity values at 50 Hz, plus touch and release events. The data collector records the proximity stream and labels each recording window from the keyboard; the touch/release events are useful for checking electrode wiring:
#include <MPR121.h>
#include <Wire.h>

unsigned long lastUpdate = 0;
const int UPDATE_MS = 20; // 50 Hz

void setup() {
  Serial.begin(115200);
  Wire.begin();
  MPR121.begin(0x5C);            // the Touch Board's MPR121 sits at 0x5C
  MPR121.setTouchThreshold(40);
  MPR121.setReleaseThreshold(20);
}

void loop() {
  // Touch events
  if (MPR121.touchStatusChanged()) {
    MPR121.updateTouchData();
    for (int i = 0; i < 12; i++) {
      if (MPR121.isNewTouch(i)) { Serial.print("T:"); Serial.println(i); }
      if (MPR121.isNewRelease(i)) { Serial.print("R:"); Serial.println(i); }
    }
  }

  // Proximity stream
  if (millis() - lastUpdate >= UPDATE_MS) {
    // Refresh the cached register values before reading them
    MPR121.updateBaselineData();
    MPR121.updateFilteredData();
    Serial.print("D:");
    for (int i = 0; i < 12; i++) {
      // Proximity = how far the filtered value has dropped below baseline
      int prox = max(0, MPR121.getBaselineData(i) - MPR121.getFilteredData(i));
      Serial.print(prox);
      if (i < 11) Serial.print(',');
    }
    Serial.println();
    lastUpdate = millis();
  }
}
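The resulting stream interleaves the two message types. A few illustrative lines (the actual values depend on your electrodes and environment):

D:0,0,3,41,120,96,12,0,0,0,0,0
D:0,0,5,44,131,102,15,0,0,0,0,0
T:4
D:0,1,6,47,140,110,18,0,0,0,0,0
R:4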
Python Data Collector
The collector records labelled examples. Run it from the terminal with the serial port and a gesture label; press SPACE to start and stop each recording, and Q to quit.
#!/usr/bin/env python3
"""Touch Board gesture data collector.
Usage:
python3 collect.py /dev/ttyACM0 --label swipe_right
python3 collect.py /dev/ttyACM0 --label tap
"""
import serial, csv, time, argparse, sys, termios, tty, select, os
BAUD = 115200
COLUMNS = [f'e{i}' for i in range(12)]
def set_cbreak(fd):
    """Put the terminal in cbreak mode so single key presses are readable
    without Enter. Returns the previous settings so they can be restored."""
    old = termios.tcgetattr(fd)
    tty.setcbreak(fd)
    return old
def collect(port_path, label, output_csv='gestures.csv'):
    write_header = not os.path.exists(output_csv)
    ser = serial.Serial(port_path, BAUD, timeout=1)
    time.sleep(1)
    ser.reset_input_buffer()
    # Single-key input without Enter; settings are restored before returning
    old_term = set_cbreak(sys.stdin.fileno())
    print(f'Collecting label: "{label}"')
    print('Press SPACE to start/stop a recording, Q to quit.\n')
examples = []
recording = False
current = []
with open(output_csv, 'a', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['label','timestamp'] + COLUMNS)
if write_header:
writer.writeheader()
buf = ''
while True:
            # Non-blocking key check (cbreak mode set above)
            r, _, _ = select.select([sys.stdin], [], [], 0)
if r:
k = sys.stdin.read(1)
if k == ' ':
if not recording:
recording = True
current = []
print(' [Recording…]')
else:
recording = False
if len(current) >= 5:
examples.append(current)
# Write all frames of this gesture as rows
for frame in current:
row = {'label': label,
'timestamp': frame['t']}
row.update({f'e{i}': frame['prox'][i]
for i in range(12)})
writer.writerow(row)
print(f' [Saved {len(current)} frames] ({len(examples)} examples total)')
else:
print(' [Too short — discarded]')
current = []
elif k in ('q', 'Q'):
break
# Read serial
chunk = ser.read(ser.in_waiting or 1).decode('ascii', errors='ignore')
buf += chunk
while '\n' in buf:
line, buf = buf.split('\n', 1)
line = line.strip()
if line.startswith('D:') and recording:
parts = line[2:].split(',')
prox = [int(p) for p in parts if p.strip().isdigit()]
if len(prox) == 12:
current.append({'t': time.time(), 'prox': prox})
    termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN, old_term)
    print(f'\nDone. {len(examples)} gestures saved to {output_csv}')
    ser.close()
if __name__ == '__main__':
ap = argparse.ArgumentParser()
ap.add_argument('port')
ap.add_argument('--label', required=True)
ap.add_argument('--output', default='gestures.csv')
args = ap.parse_args()
collect(args.port, args.label, args.output)
Collect at least 50 examples per gesture class. More is better, but 100–200 per class is usually sufficient for this feature space.
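Before moving on, check how many gestures actually made it into the CSV. A quick sketch using pandas and the same 200 ms gap convention the training notebook uses later (the file name and threshold follow this tutorial's defaults):

import pandas as pd
import numpy as np

df = pd.read_csv('gestures.csv')
for label, group in df.groupby('label'):
    ts = group.sort_values('timestamp')['timestamp'].values
    # A pause longer than 200 ms between frames separates two gestures
    n_gestures = 1 + int((np.diff(ts) > 0.2).sum()) if len(ts) else 0
    print(f'{label}: {n_gestures} gestures, {len(ts)} frames')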
Feature Engineering
Raw proximity frames form a variable-length time series, but a feed-forward classifier needs fixed-length inputs, so each gesture is collapsed into a feature vector. The alternative, training an LSTM directly on the raw sequence, is covered later in this tutorial.
Segmentation
A gesture is a temporal window bounded by hand-near and hand-far events. In the collected CSV, frames are only written while a recording is active, so frames belonging to one gesture arrive roughly 20 ms apart; a gap of more than 200 ms between consecutive timestamps therefore marks the boundary between two gestures. (At inference time, the browser sketch instead opens a window when total proximity crosses a high threshold and closes it when it falls back below a low one.)
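As a toy numerical illustration of the gap rule (the timestamps are invented for the example):

import numpy as np

timestamps = np.array([0.00, 0.02, 0.04,     # gesture 1
                       1.50, 1.52, 1.54])    # gesture 2, after a long pause
gaps = np.diff(timestamps)                   # [0.02, 0.02, 1.46, 0.02, 0.02]
split_points = np.where(gaps > 0.2)[0] + 1   # [3]
segments = np.split(np.arange(len(timestamps)), split_points)
# segments == [array([0, 1, 2]), array([3, 4, 5])]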
Feature Vector Design
For each segmented gesture, compute a feature vector:
import numpy as np
import pandas as pd
def extract_features(frames: np.ndarray) -> np.ndarray:
"""
frames: (T, 12) array of proximity values.
Returns 1D feature vector.
"""
features = []
# Statistical features per electrode
features.extend(frames.mean(axis=0)) # 12 values
features.extend(frames.std(axis=0)) # 12 values
features.extend(frames.max(axis=0)) # 12 values
features.extend(frames.min(axis=0)) # 12 values
# Centroid trajectory
weights = frames.sum(axis=1) + 1e-6 # (T,)
centroid = (frames * np.arange(12)).sum(axis=1) / weights # (T,)
features.append(centroid.mean()) # mean position
features.append(centroid.std()) # position variance
features.append(centroid[-1] - centroid[0]) # total displacement
if len(centroid) > 1:
velocity = np.diff(centroid)
features.append(velocity.mean()) # mean velocity
features.append(velocity.max()) # max velocity
else:
features.extend([0.0, 0.0])
# Total energy
features.append(frames.sum(axis=1).mean()) # mean total proximity
features.append(frames.sum(axis=1).max()) # peak total proximity
# Duration (as frame count — normalise by expected 50 Hz)
features.append(len(frames) / 50.0) # gesture duration in seconds
    return np.array(features, dtype=np.float32)  # 12*4 + 8 = 56 features
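A quick way to confirm the vector length before wiring this into the training pipeline (the random frames are just placeholders):

import numpy as np

# Fake gesture: 40 frames of 12 proximity values
dummy_frames = np.random.randint(0, 200, size=(40, 12)).astype(np.float32)
vec = extract_features(dummy_frames)
print(vec.shape)   # (56,)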
Preparing the Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
df = pd.read_csv('gestures.csv')
# Group frames into gesture examples (by timestamp gaps)
GAP_S = 0.2  # a pause longer than this (in seconds) separates two gestures
examples, labels = [], []
for label, group in df.groupby('label'):
group = group.sort_values('timestamp')
timestamps = group['timestamp'].values
proximity = group[[f'e{i}' for i in range(12)]].values
    # Split into segments wherever the gap between frames exceeds GAP_S
    gaps = np.diff(timestamps)
    split_points = np.where(gaps > GAP_S)[0] + 1
segments = np.split(np.arange(len(timestamps)), split_points)
for seg_indices in segments:
if len(seg_indices) < 5: continue
frames = proximity[seg_indices]
features = extract_features(frames)
examples.append(features)
labels.append(label)
X = np.array(examples)
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'{len(X)} examples, {len(le.classes_)} classes: {le.classes_}')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42, stratify=y)
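Before training the network, it is worth checking that the features carry signal at all. A quick scikit-learn baseline (logistic regression is an arbitrary choice here):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

baseline = LogisticRegression(max_iter=1000)
scores = cross_val_score(baseline, X_scaled, y, cv=5)
print(f'Baseline accuracy: {scores.mean():.3f} ± {scores.std():.3f}')
# If this is already near your target, the neural network may be overkill;
# if it is near chance level, revisit the data before tuning the model.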
Model Training with TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras
import numpy as np
NUM_CLASSES = len(le.classes_)
INPUT_DIM = X_train.shape[1]  # 56
model = keras.Sequential([
keras.layers.Dense(128, activation='relu', input_shape=(INPUT_DIM,)),
keras.layers.BatchNormalization(),
keras.layers.Dropout(0.3),
keras.layers.Dense(64, activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.Dropout(0.3),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
# Early stopping to prevent overfitting
early_stop = keras.callbacks.EarlyStopping(
monitor='val_accuracy', patience=20, restore_best_weights=True)
history = model.fit(
X_train, y_train,
validation_data=(X_test, y_test),
epochs=200,
batch_size=16,
callbacks=[early_stop],
verbose=1
)
# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {acc:.3f}')
# Per-class precision and recall
from sklearn.metrics import classification_report
y_pred = model.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred, target_names=le.classes_))
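The classification report summarises per-class precision and recall; the raw confusion matrix makes it easier to see which specific gesture pairs get mixed up:

import pandas as pd
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
# Rows are true labels, columns are predicted labels
print(pd.DataFrame(cm, index=le.classes_, columns=le.classes_))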
LSTM for Raw Sequence Classification
If the feature-engineering approach gives insufficient accuracy (especially for gestures that differ mainly in speed or rhythm), train an LSTM on padded frame sequences:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 75 # ~1.5 seconds at 50 Hz
# Build (raw) sequence dataset
X_seq, y_seq = [], []
for label, group in df.groupby('label'):
# ... (same segmentation as above)
for seg_indices in segments:
frames = proximity[seg_indices][:MAX_LEN] # truncate long gestures
X_seq.append(frames)
y_seq.append(le.transform([label])[0])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, dtype='float32',
padding='post', truncating='post')
y_seq = np.array(y_seq)
lstm_model = keras.Sequential([
keras.layers.Masking(mask_value=0.0, input_shape=(MAX_LEN, 12)),
keras.layers.LSTM(64, return_sequences=True),
keras.layers.LSTM(32),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dropout(0.3),
keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
lstm_model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
lstm_model.fit(X_pad, y_seq, validation_split=0.2, epochs=100,
callbacks=[early_stop])
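For a held-out accuracy number comparable to the dense model's, split the padded sequences before fitting; a sketch, meant to be used in place of the single fit call above:

from sklearn.model_selection import train_test_split

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_pad, y_seq, test_size=0.2, random_state=42, stratify=y_seq)

lstm_model.fit(Xs_train, ys_train, validation_split=0.2,
               epochs=100, callbacks=[early_stop], verbose=0)
seq_loss, seq_acc = lstm_model.evaluate(Xs_test, ys_test, verbose=0)
print(f'LSTM test accuracy: {seq_acc:.3f}')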
Exporting to TensorFlow.js
import tensorflowjs as tfjs
# Export the feature-based model (or LSTM model — same syntax)
tfjs.converters.save_keras_model(model, 'tfjs_model/')
# Also save the scaler parameters for use in JS
import json
scaler_params = {
'mean': scaler.mean_.tolist(),
'std': scaler.scale_.tolist(),
'classes': le.classes_.tolist()
}
with open('tfjs_model/scaler.json', 'w') as f:
json.dump(scaler_params, f)
The output directory tfjs_model/ contains model.json and one or more group1-shard*.bin files. Serve these from your Jekyll assets/ folder.
Deployment in p5.js with ml5.js
<!-- In your HTML template or Jekyll layout -->
<script src="https://unpkg.com/ml5@latest/dist/ml5.min.js"></script>
let classifier, scalerParams;
let proxValues = new Array(12).fill(0);
let gestureBuffer = []; // accumulates frames during a gesture
let isGesturing = false;
let lastResult = '';
let modelReady = false;

// p5's preload() does not wait for fetch()/await, so load asynchronously in
// setup() and gate classification on modelReady instead.
async function setup() {
  scalerParams = await fetch('/assets/models/tfjs_model/scaler.json')
    .then(r => r.json());
  classifier = ml5.neuralNetwork({ task: 'classification', debug: false });
  // ml5's load() takes the path to model.json (the weight shards must sit
  // next to it). If ml5 refuses a plain Keras-converted model, load it with
  // TensorFlow.js directly and map output indices to scalerParams.classes.
  await classifier.load('/assets/models/tfjs_model/model.json');
  modelReady = true;
  console.log('Model loaded');
}
function extractFeatures(frames) {
// Must match the Python extract_features() function
let F = [];
let T = frames.length;
let arr = frames; // T × 12
// Mean, std, max, min per electrode
for (let stat of ['mean','std','max','min']) {
for (let e = 0; e < 12; e++) {
let col = arr.map(row => row[e]);
if (stat === 'mean') F.push(col.reduce((a,b)=>a+b,0)/T);
else if (stat === 'std') {
let m = col.reduce((a,b)=>a+b,0)/T;
F.push(Math.sqrt(col.map(v=>(v-m)**2).reduce((a,b)=>a+b,0)/T));
}
else if (stat === 'max') F.push(Math.max(...col));
else F.push(Math.min(...col));
}
}
// Centroid trajectory
let centroids = arr.map(row => {
let w = row.reduce((a,b)=>a+b,0)+1e-6;
return row.reduce((s,v,i)=>s+i*v,0)/w;
});
let cm = centroids.reduce((a,b)=>a+b,0)/T;
let cs = Math.sqrt(centroids.map(v=>(v-cm)**2).reduce((a,b)=>a+b,0)/T);
let cd = centroids[T-1]-centroids[0];
  let mv = 0, maxv = 0;
  if (T > 1) {
    let vel = centroids.slice(1).map((v, i) => v - centroids[i]);
    mv = vel.reduce((a, b) => a + b, 0) / (T - 1);
    maxv = Math.max(...vel);
  }
  F.push(cm, cs, cd, mv, maxv);
// Energy
let energy = arr.map(row=>row.reduce((a,b)=>a+b,0));
F.push(energy.reduce((a,b)=>a+b,0)/T);
F.push(Math.max(...energy));
F.push(T/50);
return F;
}
function scaleFeatures(features) {
return features.map((v,i) => (v - scalerParams.mean[i]) / scalerParams.std[i]);
}
async function classifyGesture(frames) {
  if (!modelReady) return;   // model still loading
  let features = extractFeatures(frames);
  let scaled = scaleFeatures(features);
  // ml5's classify() takes a flat array of inputs and resolves to an array
  // of {label, confidence} results
  let results = await classifier.classify(scaled);
  lastResult = results[0].label;
  let conf = results[0].confidence;
  console.log(`Gesture: ${lastResult} (${(conf * 100).toFixed(1)}%)`);
  if (conf > 0.7) onGestureDetected(lastResult);
}
function parseLine(line) {
line = line.trim();
if (line.startsWith('D:')) {
let parts = line.slice(2).split(',').map(Number);
let totalProx = parts.reduce((a,b)=>a+b,0);
proxValues = parts;
if (totalProx > 80 && !isGesturing) {
isGesturing = true;
gestureBuffer = [];
}
if (isGesturing) {
gestureBuffer.push([...proxValues]);
if (totalProx < 20) {
isGesturing = false;
if (gestureBuffer.length >= 5) classifyGesture(gestureBuffer);
gestureBuffer = [];
}
}
}
}
function onGestureDetected(label) {
// Trigger visual / audio response
console.log('Detected:', label);
}
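The sketch above assumes that each serial line from the Touch Board somehow reaches parseLine() in the browser. One way to do that is a small Python bridge that forwards serial lines over a WebSocket; a sketch using the third-party websockets package, with an arbitrary port and path:

#!/usr/bin/env python3
"""Forward Touch Board serial lines to the browser over a WebSocket."""
import asyncio
import serial
import websockets

SERIAL_PORT = '/dev/ttyACM0'   # adjust for your system
BAUD = 115200

async def stream(websocket, path=None):
    ser = serial.Serial(SERIAL_PORT, BAUD, timeout=0.1)
    try:
        while True:
            # Blocking read with a short timeout; fine for a single local client
            line = ser.readline().decode('ascii', errors='ignore').strip()
            if line:
                await websocket.send(line)
            await asyncio.sleep(0)   # let the event loop breathe
    finally:
        ser.close()

async def main():
    async with websockets.serve(stream, 'localhost', 8765):
        await asyncio.Future()       # run until interrupted

if __name__ == '__main__':
    asyncio.run(main())

On the browser side, open a WebSocket to ws://localhost:8765 and pass each incoming message to parseLine(); alternatively, Chrome's Web Serial API can read the board directly without a bridge.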
Improving Accuracy
- More data: 200+ examples per class is the single most effective improvement
- Data augmentation: add Gaussian noise to existing frames, scale proximity values by ±20%, time-stretch or compress sequences (see the sketch after this list)
- Cross-validation: use 5-fold CV instead of a single train/test split to get a reliable accuracy estimate
- Confusion matrix analysis: identify which classes are confused and either add more training data or redesign gestures to be more distinct
- Confidence thresholding: only act on classifications with confidence > 70% — discard the rest as “unknown”
- Ensemble: train three models with different random seeds and average their output probabilities before taking argmax
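As an example of the augmentation bullet above, here is one way to synthesise extra gestures from recorded frames before feature extraction or padding (the noise level and scale range are arbitrary starting points):

import numpy as np

def augment(frames, rng=np.random.default_rng()):
    """Return a jittered copy of a (T, 12) gesture for data augmentation."""
    out = frames.astype(np.float32).copy()
    out += rng.normal(0, 2.0, size=out.shape)          # Gaussian noise
    out *= rng.uniform(0.8, 1.2)                       # global amplitude scale
    # Time-stretch by resampling to a randomly scaled length
    new_len = max(5, int(len(out) * rng.uniform(0.8, 1.2)))
    idx = np.linspace(0, len(out) - 1, new_len)
    out = np.stack([np.interp(idx, np.arange(len(out)), out[:, e])
                    for e in range(12)], axis=1)
    return np.clip(out, 0, None)

# Example: one augmented copy per recorded gesture, then extract features
# X_aug = [extract_features(augment(f)) for f in raw_gesture_frames]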
Next Steps
- Use transfer learning: train a base model on a large gesture dataset, then fine-tune on your specific gestures
- Implement on-device training: allow performers to record new gestures live and update the model without a laptop
- Explore contrastive learning to handle few-shot gesture recognition with only 5–10 examples per class
- Deploy the LSTM model to a Raspberry Pi using TensorFlow Lite for offline, no-browser inference