ML Gesture Recognition
Classify multi-electrode touch patterns using machine learning — from data collection on the Touch Board through to a trained model running in the browser.
Rule-based gesture classifiers break when users deviate from expected behaviour. Machine learning classifiers generalise: a model trained on a few hundred examples of a gesture can recognise that gesture performed faster, slower, by different people, with varying electrode contact. This tutorial builds a complete ML pipeline — data collection from the Touch Board, feature engineering, model training in Python, and deployment in the browser with ml5.js.
System Overview
Touch Board → Arduino firmware (proximity stream)
↓
Python data collector (records labelled examples to CSV)
↓
Jupyter notebook (feature engineering + model training)
↓
Export: model.json + weights.bin (TensorFlow.js format)
↓
p5.js + ml5.js (real-time inference in browser)
The full round-trip from no data to a working browser classifier takes about two hours, including data collection time.
Data Collection Firmware
The firmware streams all 12 proximity values at 50 Hz, plus touch and release events. The data collector records the proximity stream and labels each recording window from the keyboard; the touch/release events are useful for checking electrode wiring:
#include <MPR121.h>
#include <Wire.h>

unsigned long lastUpdate = 0;
const int UPDATE_MS = 20; // 50 Hz

void setup() {
  Serial.begin(115200);
  Wire.begin();
  MPR121.begin(0x5C);            // the Touch Board's MPR121 sits at 0x5C
  MPR121.setTouchThreshold(40);
  MPR121.setReleaseThreshold(20);
}

void loop() {
  // Touch events
  if (MPR121.touchStatusChanged()) {
    MPR121.updateTouchData();
    for (int i = 0; i < 12; i++) {
      if (MPR121.isNewTouch(i)) { Serial.print("T:"); Serial.println(i); }
      if (MPR121.isNewRelease(i)) { Serial.print("R:"); Serial.println(i); }
    }
  }

  // Proximity stream
  if (millis() - lastUpdate >= UPDATE_MS) {
    // Refresh the cached register values before reading them
    MPR121.updateBaselineData();
    MPR121.updateFilteredData();
    Serial.print("D:");
    for (int i = 0; i < 12; i++) {
      // Proximity = how far the filtered value has dropped below baseline
      int prox = max(0, MPR121.getBaselineData(i) - MPR121.getFilteredData(i));
      Serial.print(prox);
      if (i < 11) Serial.print(',');
    }
    Serial.println();
    lastUpdate = millis();
  }
}
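The resulting stream interleaves the two message types. A few illustrative lines (the actual values depend on your electrodes and environment):

D:0,0,3,41,120,96,12,0,0,0,0,0
D:0,0,5,44,131,102,15,0,0,0,0,0
T:4
D:0,1,6,47,140,110,18,0,0,0,0,0
R:4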
Python Data Collector
The collector records labelled examples. Run it from the terminal with the serial port and a gesture label; press SPACE to start and stop each recording, and Q to quit.
#!/usr/bin/env python3
"""Touch Board gesture data collector.
Usage:
python3 collect.py /dev/ttyACM0 --label swipe_right
python3 collect.py /dev/ttyACM0 --label tap
"""
import serial, csv, time, argparse, sys, termios, tty, select, os
BAUD = 115200
COLUMNS = [f'e{i}' for i in range(12)]
def set_cbreak(fd):
    """Put the terminal in cbreak mode so single key presses are readable
    without Enter. Returns the previous settings so they can be restored."""
    old = termios.tcgetattr(fd)
    tty.setcbreak(fd)
    return old
def collect(port_path, label, output_csv='gestures.csv'):
    write_header = not os.path.exists(output_csv)
    ser = serial.Serial(port_path, BAUD, timeout=1)
    time.sleep(1)
    ser.reset_input_buffer()
    # Single-key input without Enter; settings are restored before returning
    old_term = set_cbreak(sys.stdin.fileno())
    print(f'Collecting label: "{label}"')
    print('Press SPACE to start/stop a recording, Q to quit.\n')
examples = []
recording = False
current = []
with open(output_csv, 'a', newline='') as f:
writer = csv.DictWriter(f, fieldnames=['label','timestamp'] + COLUMNS)
if write_header:
writer.writeheader()
buf = ''
while True:
            # Non-blocking key check (cbreak mode set above)
            r, _, _ = select.select([sys.stdin], [], [], 0)
if r:
k = sys.stdin.read(1)
if k == ' ':
if not recording:
recording = True
current = []
print(' [Recording…]')
else:
recording = False
if len(current) >= 5:
examples.append(current)
# Write all frames of this gesture as rows
for frame in current:
row = {'label': label,
'timestamp': frame['t']}
row.update({f'e{i}': frame['prox'][i]
for i in range(12)})
writer.writerow(row)
print(f' [Saved {len(current)} frames] ({len(examples)} examples total)')
else:
print(' [Too short — discarded]')
current = []
elif k in ('q', 'Q'):
break
# Read serial
chunk = ser.read(ser.in_waiting or 1).decode('ascii', errors='ignore')
buf += chunk
while '\n' in buf:
line, buf = buf.split('\n', 1)
line = line.strip()
if line.startswith('D:') and recording:
parts = line[2:].split(',')
prox = [int(p) for p in parts if p.strip().isdigit()]
if len(prox) == 12:
current.append({'t': time.time(), 'prox': prox})
    termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN, old_term)
    print(f'\nDone. {len(examples)} gestures saved to {output_csv}')
    ser.close()
if __name__ == '__main__':
ap = argparse.ArgumentParser()
ap.add_argument('port')
ap.add_argument('--label', required=True)
ap.add_argument('--output', default='gestures.csv')
args = ap.parse_args()
collect(args.port, args.label, args.output)
Collect at least 50 examples per gesture class. More is better, but 100–200 per class is usually sufficient for this feature space.
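Before moving on, check how many gestures actually made it into the CSV. A quick sketch using pandas and the same 200 ms gap convention the training notebook uses later (the file name and threshold follow this tutorial's defaults):

import pandas as pd
import numpy as np

df = pd.read_csv('gestures.csv')
for label, group in df.groupby('label'):
    ts = group.sort_values('timestamp')['timestamp'].values
    # A pause longer than 200 ms between frames separates two gestures
    n_gestures = 1 + int((np.diff(ts) > 0.2).sum()) if len(ts) else 0
    print(f'{label}: {n_gestures} gestures, {len(ts)} frames')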
Feature Engineering
Raw proximity frames form a variable-length time series, but a feed-forward classifier needs fixed-length inputs, so each gesture is collapsed into a feature vector. The alternative, training an LSTM directly on the raw sequence, is covered later in this tutorial.
Segmentation
A gesture is a temporal window bounded by hand-near and hand-far events. In the collected CSV, frames are only written while a recording is active, so frames belonging to one gesture arrive roughly 20 ms apart; a gap of more than 200 ms between consecutive timestamps therefore marks the boundary between two gestures. (At inference time, the browser sketch instead opens a window when total proximity crosses a high threshold and closes it when it falls back below a low one.)
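As a toy numerical illustration of the gap rule (the timestamps are invented for the example):

import numpy as np

timestamps = np.array([0.00, 0.02, 0.04,     # gesture 1
                       1.50, 1.52, 1.54])    # gesture 2, after a long pause
gaps = np.diff(timestamps)                   # [0.02, 0.02, 1.46, 0.02, 0.02]
split_points = np.where(gaps > 0.2)[0] + 1   # [3]
segments = np.split(np.arange(len(timestamps)), split_points)
# segments == [array([0, 1, 2]), array([3, 4, 5])]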
Feature Vector Design
For each segmented gesture, compute a feature vector:
import numpy as np
import pandas as pd
def extract_features(frames: np.ndarray) -> np.ndarray:
"""
frames: (T, 12) array of proximity values.
Returns 1D feature vector.
"""
features = []
# Statistical features per electrode
features.extend(frames.mean(axis=0)) # 12 values
features.extend(frames.std(axis=0)) # 12 values
features.extend(frames.max(axis=0)) # 12 values
features.extend(frames.min(axis=0)) # 12 values
# Centroid trajectory
weights = frames.sum(axis=1) + 1e-6 # (T,)
centroid = (frames * np.arange(12)).sum(axis=1) / weights # (T,)
features.append(centroid.mean()) # mean position
features.append(centroid.std()) # position variance
features.append(centroid[-1] - centroid[0]) # total displacement
if len(centroid) > 1:
velocity = np.diff(centroid)
features.append(velocity.mean()) # mean velocity
features.append(velocity.max()) # max velocity
else:
features.extend([0.0, 0.0])
# Total energy
features.append(frames.sum(axis=1).mean()) # mean total proximity
features.append(frames.sum(axis=1).max()) # peak total proximity
# Duration (as frame count — normalise by expected 50 Hz)
features.append(len(frames) / 50.0) # gesture duration in seconds
    return np.array(features, dtype=np.float32)  # 12*4 + 8 = 56 features
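A quick way to confirm the vector length before wiring this into the training pipeline (the random frames are just placeholders):

import numpy as np

# Fake gesture: 40 frames of 12 proximity values
dummy_frames = np.random.randint(0, 200, size=(40, 12)).astype(np.float32)
vec = extract_features(dummy_frames)
print(vec.shape)   # (56,)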
Preparing the Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
df = pd.read_csv('gestures.csv')
# Group frames into gesture examples (by timestamp gaps)
GAP_S = 0.2  # a pause longer than this (in seconds) separates two gestures
examples, labels = [], []
for label, group in df.groupby('label'):
group = group.sort_values('timestamp')
timestamps = group['timestamp'].values
proximity = group[[f'e{i}' for i in range(12)]].values
    # Split into segments wherever the gap between frames exceeds GAP_S
    gaps = np.diff(timestamps)
    split_points = np.where(gaps > GAP_S)[0] + 1
segments = np.split(np.arange(len(timestamps)), split_points)
for seg_indices in segments:
if len(seg_indices) < 5: continue
frames = proximity[seg_indices]
features = extract_features(frames)
examples.append(features)
labels.append(label)
X = np.array(examples)
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'{len(X)} examples, {len(le.classes_)} classes: {le.classes_}')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42, stratify=y)
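Before training the network, it is worth checking that the features carry signal at all. A quick scikit-learn baseline (logistic regression is an arbitrary choice here):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

baseline = LogisticRegression(max_iter=1000)
scores = cross_val_score(baseline, X_scaled, y, cv=5)
print(f'Baseline accuracy: {scores.mean():.3f} ± {scores.std():.3f}')
# If this is already near your target, the neural network may be overkill;
# if it is near chance level, revisit the data before tuning the model.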
Model Training with TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras
import numpy as np
NUM_CLASSES = len(le.classes_)
INPUT_DIM = X_train.shape[1]  # 56
model = keras.Sequential([
keras.layers.Dense(128, activation='relu', input_shape=(INPUT_DIM,)),
keras.layers.BatchNormalization(),
keras.layers.Dropout(0.3),
keras.layers.Dense(64, activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.Dropout(0.3),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
# Early stopping to prevent overfitting
early_stop = keras.callbacks.EarlyStopping(
monitor='val_accuracy', patience=20, restore_best_weights=True)
history = model.fit(
X_train, y_train,
validation_data=(X_test, y_test),
epochs=200,
batch_size=16,
callbacks=[early_stop],
verbose=1
)
# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {acc:.3f}')
# Per-class precision and recall
from sklearn.metrics import classification_report
y_pred = model.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred, target_names=le.classes_))
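The classification report summarises per-class precision and recall; the raw confusion matrix makes it easier to see which specific gesture pairs get mixed up:

import pandas as pd
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
# Rows are true labels, columns are predicted labels
print(pd.DataFrame(cm, index=le.classes_, columns=le.classes_))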
LSTM for Raw Sequence Classification
If the feature-engineering approach gives insufficient accuracy (especially for gestures that differ mainly in speed or rhythm), train an LSTM on padded frame sequences:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 75 # ~1.5 seconds at 50 Hz
# Build (raw) sequence dataset
X_seq, y_seq = [], []
for label, group in df.groupby('label'):
# ... (same segmentation as above)
for seg_indices in segments:
frames = proximity[seg_indices][:MAX_LEN] # truncate long gestures
X_seq.append(frames)
y_seq.append(le.transform([label])[0])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, dtype='float32',
padding='post', truncating='post')
y_seq = np.array(y_seq)
lstm_model = keras.Sequential([
keras.layers.Masking(mask_value=0.0, input_shape=(MAX_LEN, 12)),
keras.layers.LSTM(64, return_sequences=True),
keras.layers.LSTM(32),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dropout(0.3),
keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
lstm_model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
lstm_model.fit(X_pad, y_seq, validation_split=0.2, epochs=100,
callbacks=[early_stop])
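For a held-out accuracy number comparable to the dense model's, split the padded sequences before fitting; a sketch, meant to be used in place of the single fit call above:

from sklearn.model_selection import train_test_split

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_pad, y_seq, test_size=0.2, random_state=42, stratify=y_seq)

lstm_model.fit(Xs_train, ys_train, validation_split=0.2,
               epochs=100, callbacks=[early_stop], verbose=0)
seq_loss, seq_acc = lstm_model.evaluate(Xs_test, ys_test, verbose=0)
print(f'LSTM test accuracy: {seq_acc:.3f}')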
Exporting to TensorFlow.js
import tensorflowjs as tfjs
# Export the feature-based model (or LSTM model — same syntax)
tfjs.converters.save_keras_model(model, 'tfjs_model/')
# Also save the scaler parameters for use in JS
import json
scaler_params = {
'mean': scaler.mean_.tolist(),
'std': scaler.scale_.tolist(),
'classes': le.classes_.tolist()
}
with open('tfjs_model/scaler.json', 'w') as f:
json.dump(scaler_params, f)
The output directory tfjs_model/ contains model.json and one or more group1-shard*.bin files. Serve these from your Jekyll assets/ folder.
Deployment in p5.js with ml5.js
<!-- In your HTML template or Jekyll layout -->
<script src="https://unpkg.com/ml5@latest/dist/ml5.min.js"></script>
let classifier, scalerParams;
let proxValues = new Array(12).fill(0);
let gestureBuffer = []; // accumulates frames during a gesture
let isGesturing = false;
let lastResult = '';
let modelReady = false;

// p5's preload() does not wait for fetch()/await, so load asynchronously in
// setup() and gate classification on modelReady instead.
async function setup() {
  scalerParams = await fetch('/assets/models/tfjs_model/scaler.json')
    .then(r => r.json());
  classifier = ml5.neuralNetwork({ task: 'classification', debug: false });
  // ml5's load() takes the path to model.json (the weight shards must sit
  // next to it). If ml5 refuses a plain Keras-converted model, load it with
  // TensorFlow.js directly and map output indices to scalerParams.classes.
  await classifier.load('/assets/models/tfjs_model/model.json');
  modelReady = true;
  console.log('Model loaded');
}
function extractFeatures(frames) {
// Must match the Python extract_features() function
let F = [];
let T = frames.length;
let arr = frames; // T × 12
// Mean, std, max, min per electrode
for (let stat of ['mean','std','max','min']) {
for (let e = 0; e < 12; e++) {
let col = arr.map(row => row[e]);
if (stat === 'mean') F.push(col.reduce((a,b)=>a+b,0)/T);
else if (stat === 'std') {
let m = col.reduce((a,b)=>a+b,0)/T;
F.push(Math.sqrt(col.map(v=>(v-m)**2).reduce((a,b)=>a+b,0)/T));
}
else if (stat === 'max') F.push(Math.max(...col));
else F.push(Math.min(...col));
}
}
// Centroid trajectory
let centroids = arr.map(row => {
let w = row.reduce((a,b)=>a+b,0)+1e-6;
return row.reduce((s,v,i)=>s+i*v,0)/w;
});
let cm = centroids.reduce((a,b)=>a+b,0)/T;
let cs = Math.sqrt(centroids.map(v=>(v-cm)**2).reduce((a,b)=>a+b,0)/T);
let cd = centroids[T-1]-centroids[0];
  let mv = 0, maxv = 0;
  if (T > 1) {
    let vel = centroids.slice(1).map((v, i) => v - centroids[i]);
    mv = vel.reduce((a, b) => a + b, 0) / (T - 1);
    maxv = Math.max(...vel);
  }
  F.push(cm, cs, cd, mv, maxv);
// Energy
let energy = arr.map(row=>row.reduce((a,b)=>a+b,0));
F.push(energy.reduce((a,b)=>a+b,0)/T);
F.push(Math.max(...energy));
F.push(T/50);
return F;
}
function scaleFeatures(features) {
return features.map((v,i) => (v - scalerParams.mean[i]) / scalerParams.std[i]);
}
async function classifyGesture(frames) {
  if (!modelReady) return;   // model still loading
  let features = extractFeatures(frames);
  let scaled = scaleFeatures(features);
  // ml5's classify() takes a flat array of inputs and resolves to an array
  // of {label, confidence} results
  let results = await classifier.classify(scaled);
  lastResult = results[0].label;
  let conf = results[0].confidence;
  console.log(`Gesture: ${lastResult} (${(conf * 100).toFixed(1)}%)`);
  if (conf > 0.7) onGestureDetected(lastResult);
}
function parseLine(line) {
line = line.trim();
if (line.startsWith('D:')) {
let parts = line.slice(2).split(',').map(Number);
let totalProx = parts.reduce((a,b)=>a+b,0);
proxValues = parts;
if (totalProx > 80 && !isGesturing) {
isGesturing = true;
gestureBuffer = [];
}
if (isGesturing) {
gestureBuffer.push([...proxValues]);
if (totalProx < 20) {
isGesturing = false;
if (gestureBuffer.length >= 5) classifyGesture(gestureBuffer);
gestureBuffer = [];
}
}
}
}
function onGestureDetected(label) {
// Trigger visual / audio response
console.log('Detected:', label);
}
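The sketch above assumes that each serial line from the Touch Board somehow reaches parseLine() in the browser. One way to do that is a small Python bridge that forwards serial lines over a WebSocket; a sketch using the third-party websockets package, with an arbitrary port and path:

#!/usr/bin/env python3
"""Forward Touch Board serial lines to the browser over a WebSocket."""
import asyncio
import serial
import websockets

SERIAL_PORT = '/dev/ttyACM0'   # adjust for your system
BAUD = 115200

async def stream(websocket, path=None):
    ser = serial.Serial(SERIAL_PORT, BAUD, timeout=0.1)
    try:
        while True:
            # Blocking read with a short timeout; fine for a single local client
            line = ser.readline().decode('ascii', errors='ignore').strip()
            if line:
                await websocket.send(line)
            await asyncio.sleep(0)   # let the event loop breathe
    finally:
        ser.close()

async def main():
    async with websockets.serve(stream, 'localhost', 8765):
        await asyncio.Future()       # run until interrupted

if __name__ == '__main__':
    asyncio.run(main())

On the browser side, open a WebSocket to ws://localhost:8765 and pass each incoming message to parseLine(); alternatively, Chrome's Web Serial API can read the board directly without a bridge.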
Improving Accuracy
- More data: 200+ examples per class is the single most effective improvement
- Data augmentation: add Gaussian noise to existing frames, scale proximity values by ±20%, time-stretch or compress sequences (see the sketch after this list)
- Cross-validation: use 5-fold CV instead of a single train/test split to get a reliable accuracy estimate
- Confusion matrix analysis: identify which classes are confused and either add more training data or redesign gestures to be more distinct
- Confidence thresholding: only act on classifications with confidence > 70% — discard the rest as “unknown”
- Ensemble: train three models with different random seeds and average their output probabilities before taking argmax
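As an example of the augmentation bullet above, here is one way to synthesise extra gestures from recorded frames before feature extraction or padding (the noise level and scale range are arbitrary starting points):

import numpy as np

def augment(frames, rng=np.random.default_rng()):
    """Return a jittered copy of a (T, 12) gesture for data augmentation."""
    out = frames.astype(np.float32).copy()
    out += rng.normal(0, 2.0, size=out.shape)          # Gaussian noise
    out *= rng.uniform(0.8, 1.2)                       # global amplitude scale
    # Time-stretch by resampling to a randomly scaled length
    new_len = max(5, int(len(out) * rng.uniform(0.8, 1.2)))
    idx = np.linspace(0, len(out) - 1, new_len)
    out = np.stack([np.interp(idx, np.arange(len(out)), out[:, e])
                    for e in range(12)], axis=1)
    return np.clip(out, 0, None)

# Example: one augmented copy per recorded gesture, then extract features
# X_aug = [extract_features(augment(f)) for f in raw_gesture_frames]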
Next Steps
- Use transfer learning: train a base model on a large gesture dataset, then fine-tune on your specific gestures
- Implement on-device training: allow performers to record new gestures live and update the model without a laptop
- Explore contrastive learning to handle few-shot gesture recognition with only 5–10 examples per class
- Deploy the LSTM model to a Raspberry Pi using TensorFlow Lite for offline, no-browser inference