p5.js Expert Article 2

Machine Learning with ml5.js

Use pre-trained models for pose estimation, hand tracking, image classification, and body segmentation — all in the browser.

⏱ 28 min read ml5.js machine learning PoseNet Handpose TensorFlow.js pose estimation

What is ml5.js?

ml5.js is a high-level library built on top of TensorFlow.js. It wraps pre-trained models — pose detection, object detection, image classification, text generation, and more — in a p5.js-friendly API. No ML expertise required.

<script src="https://cdn.jsdelivr.net/npm/ml5@1/dist/ml5.min.js"></script>

PoseNet — full-body pose estimation

The example below uses MoveNet — PoseNet's faster successor, with the same 17-keypoint output — to detect body keypoints from a webcam feed in real time:

let cam, pose, detector;

async function setup() {
  createCanvas(640, 480);

  // Hidden webcam element; frames are drawn manually in draw().
  cam = createCapture(VIDEO);
  cam.size(640, 480);
  cam.hide();

  // MoveNet is faster than classic PoseNet.
  // THUNDER favours accuracy; LIGHTNING favours speed.
  const options = { modelType: 'SINGLEPOSE_THUNDER' };
  detector = await ml5.poseDetection('MoveNet', options);

  detectPose();
}

// Continuously poll the detector for the lifetime of the page, keeping the
// most recent single-person result in the module-level `pose`.
async function detectPose() {
  while (true) {
    try {
      const results = await detector.detect(cam.elt);
      if (results.length > 0) pose = results[0];
    } catch (err) {
      // Without this, one failed detection would reject the async function
      // and silently kill the loop — the sketch would freeze on the last
      // good pose. Log and back off briefly before retrying.
      console.error('pose detection failed:', err);
      await new Promise((r) => setTimeout(r, 500));
      continue;
    }
    await new Promise((r) => setTimeout(r, 0));  // yield so draw() can run
  }
}

// Render the camera frame, then overlay detected keypoints and skeleton.
function draw() {
  image(cam, 0, 0);

  if (!pose) return;  // nothing detected yet

  // Keypoints (skip low-confidence detections).
  noStroke();
  fill(100, 255, 150);
  for (const kp of pose.keypoints) {
    if (kp.score < 0.3) continue;
    circle(kp.x, kp.y, 10);
  }

  // Skeleton. The original fell back to poseConnections(), a function that
  // is never defined in this sketch and would throw a ReferenceError; fall
  // back to an empty list (no skeleton) instead when the constant is absent.
  const connections = ml5.POSE_CONNECTIONS ?? [];
  stroke(255, 100, 50);
  strokeWeight(2);
  for (const [a, b] of connections) {
    const kpA = pose.keypoints[a];
    const kpB = pose.keypoints[b];
    // Guard against out-of-range indices as well as low confidence.
    if (kpA && kpB && kpA.score > 0.3 && kpB.score > 0.3) {
      line(kpA.x, kpA.y, kpB.x, kpB.y);
    }
  }
}

HandPose — hand landmark detection

21 keypoints per hand:

let handDetector, hands = [];

async function setup() {
  createCanvas(640, 480);

  cam = createCapture(VIDEO);
  cam.size(640, 480);
  cam.hide();

  // Track up to two hands; keep the image un-mirrored.
  const options = { maxHands: 2, flipped: false };
  handDetector = await ml5.handPose(options);

  // Stream results; the latest detection set always lives in `hands`.
  handDetector.detectStart(cam, (results) => {
    hands = results;
  });
}

function draw() {
  image(cam, 0, 0);

  for (const hand of hands) {
    drawHandKeypoints(hand);
    drawHandConnections(hand);
    drawPinchIndicator(hand);
  }
}

// Dot for each of the 21 hand landmarks.
function drawHandKeypoints(hand) {
  noStroke();
  fill(200, 100, 255);
  for (const kp of hand.keypoints) {
    circle(kp.x, kp.y, 8);
  }
}

// Line segments between connected landmark pairs.
function drawHandConnections(hand) {
  stroke(255, 255, 0);
  strokeWeight(1.5);
  for (const [a, b] of hand.connections) {
    const kpA = hand.keypoints[a];
    const kpB = hand.keypoints[b];
    line(kpA.x, kpA.y, kpB.x, kpB.y);
  }
}

// Highlight the midpoint when thumb tip (4) and index tip (8) nearly touch.
function drawPinchIndicator(hand) {
  const thumb = hand.keypoints[4];
  const index = hand.keypoints[8];
  if (dist(thumb.x, thumb.y, index.x, index.y) < 30) {
    noStroke();
    fill(255, 255, 0, 150);
    circle((thumb.x + index.x) / 2, (thumb.y + index.y) / 2, 30);
  }
}

Image classification

let classifier, label = 'Loading...', confidence = 0;
let cam;

function setup() {
  createCanvas(640, 520);

  cam = createCapture(VIDEO);
  cam.size(640, 480);
  cam.hide();

  // Start classifying as soon as MobileNet's weights finish loading.
  classifier = ml5.imageClassifier('MobileNet', classifyFrame);
}

// Continuously classify the webcam feed, keeping the top result in
// `label` / `confidence`.
function classifyFrame() {
  // The CDN tag above loads ml5@1, which dropped v0's (error, results)
  // callback signature — the original error-first callback would have
  // treated the results array as an error. In v1, classifyStart() runs the
  // continuous loop itself and invokes the callback with results only.
  classifier.classifyStart(cam, (results) => {
    label      = results[0].label;
    confidence = results[0].confidence;
  });
}

function draw() {
  image(cam, 0, 0);

  // Semi-transparent caption bar along the bottom edge.
  noStroke();
  fill(0, 0, 0, 160);
  rect(0, height - 50, width, 50);

  // Centered "label (xx.x%)" readout.
  const pct = nf(confidence * 100, 1, 1);
  textAlign(CENTER, CENTER);
  textSize(18);
  fill(255);
  text(`${label} (${pct}%)`, width / 2, height - 20);
}

Body segmentation — remove background

let bodySegmentation, segmentation;
let cam;

async function setup() {
  createCanvas(640, 480);

  cam = createCapture(VIDEO);
  cam.size(640, 480);
  cam.hide();

  // Ask for a person mask (as opposed to a background or body-parts mask).
  const options = { maskType: 'person' };
  bodySegmentation = ml5.bodySegmentation('SelfieSegmentation', options);
  await bodySegmentation.ready;

  // Stream segmentation results; the latest one is kept in `segmentation`.
  bodySegmentation.detectStart(cam, (result) => {
    segmentation = result;
  });
}

// Composite the segmented person over a solid replacement background.
function draw() {
  background(20, 80, 150);  // replacement background

  if (segmentation && segmentation.mask) {
    // The original drew `image(cam, 0, 0)` last, covering both the
    // background color and the mask, so the effect never appeared.
    // Instead, alpha-mask the current frame so background pixels become
    // transparent and the blue backdrop shows through.
    // NOTE(review): assumes maskType 'person' yields a mask whose alpha
    // keeps the person and drops the background — confirm against ml5 docs.
    const person = cam.get();        // snapshot the frame as a p5.Image
    person.mask(segmentation.mask);  // punch out the background
    image(person, 0, 0, width, height);
  } else {
    image(cam, 0, 0);  // raw feed until the first segmentation arrives
  }
}

Custom classifier — teachable machine style

Train a simple image classifier on your own categories using a webcam:

let featureExtractor, classifier;
let cam;                 // fix: setup() assigned this without declaring it (implicit global)
let currentLabel = '';   // fix: classifyLoop() assigned this without declaring it —
                         // an implicit global, and a ReferenceError in strict mode
let isTraining = false;  // NOTE(review): never read in this sketch — candidate for removal

function setup() {
  createCanvas(640, 520);

  cam = createCapture(VIDEO);
  cam.size(640, 480);
  cam.hide();

  // NOTE(review): ml5.featureExtractor is a v0.x API, while the CDN tag at
  // the top of this article loads ml5@1 (where transfer learning moved to
  // ml5.neuralNetwork) — confirm the intended ml5 version before publishing.
  featureExtractor = ml5.featureExtractor('MobileNet', () => {
    // Once MobileNet's weights are loaded, build a classifier head on top.
    classifier = featureExtractor.classification(cam);
  });
}

// Capture the current webcam frame as a training example for `label`.
function addExample(label) {
  // Guard: MobileNet loads asynchronously, so `classifier` is undefined for
  // the first moments after setup(); calling addImage() then would throw.
  if (!classifier) {
    print('Model still loading — try again in a moment');
    return;
  }
  classifier.addImage(label);
  print(`Added example for: ${label}`);
}

// Train the custom classifier on the collected examples, then start
// continuous classification once training finishes.
function trainModel() {
  // Same readiness guard as addExample().
  if (!classifier) {
    print('Model still loading — try again in a moment');
    return;
  }
  // The callback fires repeatedly with the current loss; a null loss value
  // signals that training is complete.
  classifier.train((lossValue) => {
    if (lossValue === null) {
      print('Training complete');
      classifyLoop();
    }
  });
}

// Continuously classify the live feed, storing the top label in
// `currentLabel`.
function classifyLoop() {
  classifier.classify((err, results) => {
    if (err) {
      // The original swallowed errors silently, so a broken model would
      // just appear frozen; surface them instead.
      console.error(err);
    } else {
      currentLabel = results[0].label;
    }
    classifyLoop();  // schedule the next classification
  });
}

Key takeaways

  • ml5.js wraps TensorFlow.js models in a friendly API — include via CDN
  • MoveNet (SINGLEPOSE_LIGHTNING) is the fastest current pose model; use THUNDER for accuracy
  • HandPose provides 21 landmarks per hand; measure finger distances for gesture recognition
  • Image classification (MobileNet) runs continuously via a callback loop
  • Body segmentation produces a mask that separates the person from the background
  • Feature extraction + custom training lets you build Teachable Machine-style classifiers in code