November 14, 2022

ONNX Model Deployment to the Frontend from a PyTorch Model

deep-learning

pytorch

react

Repository

Script to Convert a PyTorch Model into an ONNX Model

Let's take a model called RetinaFace as an example. Given a target model that has already been trained, we convert it into an ONNX model using PyTorch's built-in export function:

import torch

retina_face = RetinaFace(cfg=cfg)
retina_face.eval()
regression_parser = SimplifiedRegressionParser()
regression_parser.eval()

dummy_inputs_retina_face = torch.randn(
    1, 3, config.input_img_size, config.input_img_size
)
dummy_inputs_regression_parser = (
    torch.randn(1, config.n_priors, 4),
    torch.randn(1, config.n_priors),
    torch.randn(1, config.n_priors, 196)
)

torch.onnx.export(
    retina_face,
    dummy_inputs_retina_face,
    'FaceDetector.onnx',
    export_params=True,
    verbose=False,
    input_names=["inputImage"],
    output_names=["bbox_regressions", "scores", "ldm_regressions"],
    opset_version=11
)
torch.onnx.export(
    regression_parser,
    args=dummy_inputs_regression_parser,
    f='RegressionParser.onnx',
    export_params=True,
    verbose=False,
    input_names=["bbox_regressions", "scores", "landm_regressions"],
    output_names=["boxes", "scores", "landms"],
    opset_version=11
)
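
Since the models are going to be consumed from JavaScript anyway, it can be worth sanity-checking that the exported files load and expose the input/output names the frontend code will rely on. A minimal sketch, assuming onnxruntime-node is installed (the script path is hypothetical):

// scripts/checkExports.ts (hypothetical helper script)

import * as ort from "onnxruntime-node";

async function checkModel(path: string) {
  // Create a session and print the I/O names that the frontend feeds/reads by name.
  const session = await ort.InferenceSession.create(path);
  console.log(path, "inputs:", session.inputNames, "outputs:", session.outputNames);
}

(async () => {
  await checkModel("FaceDetector.onnx");
  await checkModel("RegressionParser.onnx");
})();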

Script to Deploy the ONNX Models in the Frontend

Loading Detection Head Model and Prediction Parser Model

Let's create a file utils/modelUtils.ts. For inference we break our model into two parts:

  • faceSession
  • faceRegressionSession

and load them via:

// utils/modelUtils.ts

import * as ort from "onnxruntime-web";
import { useEffect, useState, Dispatch, SetStateAction } from "react";
import { getImageTensorFromPath } from "./imageUtils";
import { time } from "./timeUtils";

// Paths to the exported ONNX models; adjust these to wherever the .onnx files are served from.
const faceUrl = "/models/FaceDetector.onnx";
const regressionUrl = "/models/RegressionParser.onnx";

export let faceSession: ort.InferenceSession;
export let faceRegressionSession: ort.InferenceSession;

export const loadModels = async () => {
  faceSession = await ort.InferenceSession.create(faceUrl, {
    executionProviders: ["webgl"],
    graphOptimizationLevel: "all",
  });

  faceRegressionSession = await ort.InferenceSession.create(regressionUrl, {
    executionProviders: ["wasm"],
    graphOptimizationLevel: "all",
  });
};
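
loadModels only needs to be called once. A minimal sketch of triggering it from a React hook (the hook name useModels is an assumption; useEffect and useState are already imported at the top of the file):

// Hypothetical hook: load both sessions once on mount and expose a readiness flag.
export const useModels = (): boolean => {
  const [modelsLoaded, setModelsLoaded] = useState(false);

  useEffect(() => {
    loadModels()
      .then(() => setModelsLoaded(true))
      .catch((e) => console.error("Failed to load ONNX models", e));
  }, []);

  return modelsLoaded;
};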
Why the Heck Do We Need Two Models?

As can be seen in the executionProviders parameter above, we can choose between the webgl and wasm backends for each model. webgl runs inference on the GPU, but it comes with limitations:

  • webgl does not support all operators in the given opset_version
  • Many commonly used operators have no webgl support
  • A complete list of supported operators can be found in this link

The way to get around this limitation is:

  • Move the unsupported operators into a separate model
  • wasm supports all operators in a given opset_version
  • Therefore we load that additional model with the wasm backend, so the unsupported operators can still be executed
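
On top of splitting the model, it can also help to guard against devices where webgl session creation fails entirely. A minimal sketch of such a fallback, assuming the same ort import as above (the helper name is hypothetical):

// Hypothetical helper: try webgl first, fall back to wasm if session creation fails.
const createSessionWithFallback = async (url: string): Promise<ort.InferenceSession> => {
  try {
    return await ort.InferenceSession.create(url, {
      executionProviders: ["webgl"],
      graphOptimizationLevel: "all",
    });
  } catch (e) {
    console.warn("webgl backend failed, falling back to wasm:", e);
    return await ort.InferenceSession.create(url, {
      executionProviders: ["wasm"],
      graphOptimizationLevel: "all",
    });
  }
};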
Script to Execute Prediction

There are a number of image utility functions; utils/imageUtils.ts is reproduced below for reference:

// utils/imageUtils.ts

import * as Jimp from 'jimp';
import { Tensor } from 'onnxruntime-web';
import { getConfig } from './configUtils';

const config = getConfig();

export async function getImageTensorFromPath(
  path: string,
  dims: number[] = [
    1,
    3,
    config.modelRequiredSizes?.width || 640,
    config.modelRequiredSizes?.height || 640
  ]
): Promise<Tensor> {
  const image = await loadImagefromPath(path, dims[2], dims[3]);
  const imageTensor = imageDataToTensor(image, dims);
  return imageTensor;
}

async function loadImagefromPath(
  path: string,
  width: number = config.modelRequiredSizes?.width || 640,
  height: number = config.modelRequiredSizes?.height || 640
): Promise<Jimp> {
  // Use Jimp to load the image and resize it to the model's required input size.
  const imageData = await Jimp.default.read(path).then((imageBuffer: Jimp) => {
    return imageBuffer.resize(width, height);
  });

  return imageData;
}
function imageDataToTensor(image: Jimp, dims: number[]): Tensor {
  // 1. Get buffer data from image and create R, G, and B arrays.
  const imageBufferData = image.bitmap.data;
  const redArray: number[] = [];
  const greenArray: number[] = [];
  const blueArray: number[] = [];

  // 2. Loop through the image buffer and extract the R, G, and B channels
  for (let i = 0; i < imageBufferData.length; i += 4) {
    redArray.push(imageBufferData[i]);
    greenArray.push(imageBufferData[i + 1]);
    blueArray.push(imageBufferData[i + 2]);
    // skip data[i + 3] to filter out the alpha channel
  }

  // 3. Concatenate R, G and B to transpose [H, W, 3] -> [3, H, W] into a flat number array
  const transposedData = redArray.concat(greenArray).concat(blueArray);

  // 4. Convert to float32 and normalize to [0, 1]
  const l = transposedData.length; // we need the length for the loop below
  // create a Float32Array of size 3 * height * width (dims[1] * dims[2] * dims[3]) for the output
  const float32Data = new Float32Array(dims[1] * dims[2] * dims[3]);
  for (let i = 0; i < l; i++) {
    float32Data[i] = transposedData[i] / 255.0; // convert to float
  }
  // 5. create the tensor object from onnxruntime-web.
  const inputTensor = new Tensor("float32", float32Data, dims);
  return inputTensor;
}

// utils/modelUtils.ts (continued)

import * as ort from 'onnxruntime-web';
import { useEffect, useState, Dispatch, SetStateAction } from 'react';
import { getImageTensorFromPath } from './imageUtils';
import { time } from './timeUtils';

export const getPredictionFromImagePath = async (imgUrl: string): Promise<{
  hasPositiveResult: boolean,
  result: {
    inferenceTime: number,
    regressionTime: number,
    box: number[],
    landm: number[],
    scores: number[]
  }
}> => {
  if (faceSession && faceRegressionSession) {

    const inferenceStartTime = time.time()
    const imgTensor = await getImageTensorFromPath(imgUrl || "")
    const feeds: Record<string, ort.Tensor> = {};
    feeds[faceSession.inputNames[0]] = imgTensor;
    const outputData = await faceSession.run(feeds);
    const output0 = outputData[faceSession.outputNames[0]];
    const output1 = outputData[faceSession.outputNames[1]];
    const output2 = outputData[faceSession.outputNames[2]];
    const inferenceEndtime = time.time()

    const inferenceTime = inferenceEndtime - inferenceStartTime;

    const feeds_: Record<string, ort.Tensor> = {};
    feeds_[faceRegressionSession.inputNames[0]] = output0;
    feeds_[faceRegressionSession.inputNames[1]] = output1;
    feeds_[faceRegressionSession.inputNames[2]] = output2;

    const regressionStartTime = time.time();
    const output = await faceRegressionSession.run(feeds_);
    const regressionEndtime = time.time();
    const regressionTime = regressionEndtime - regressionStartTime

    const { boxes, landms, scores } = output;
    const hasPositiveResult = !!(boxes && landms && scores && boxes.data.length > 0);

    if (hasPositiveResult) {
      return ({
        hasPositiveResult: true,
        result: {
          inferenceTime,
          regressionTime,
          box: (boxes.data as any) as number[],
          landm: (landms.data as any) as number[],
          scores: (scores.data as any) as number[]
        }
      })
    }
  }
  return ({
    hasPositiveResult: false,
    result: {
      inferenceTime: 0,
      regressionTime: 0,
      box: [],
      landm: [],
      scores: [],
    }
  })
}
Start the Prediction Loop and Draw Result in Canvas
const startPrediction_ = async () => {
  if (continuePredictionRef.current) {
    const webcamRef = webcamVideoRef.current;
    const video = webcamRef?.video;
    const canvas = canvasRef.current;

    if (video && canvas) {
      if (canvasWidthHeight.width == 0 && canvasWidthHeight.height == 0) {
        // both are zero when being initialized like after changing the camera device
        setCanvasWidthHeight({
          width: video?.videoWidth,
          height: video?.videoHeight,
        });
      }
      const imgUrl = webcamRef.getScreenshot(
        config?.modelRequiredSizes || { width: 640, height: 640 }
      );
      const ctx = canvas.getContext("2d");
      if (ctx) {
        if (imgUrl) {
          const prediction = await getPredictionFromImagePath(imgUrl);
          const { hasPositiveResult, result } = prediction;
          if (hasPositiveResult) {
            const { box, landm, scores, inferenceTime, regressionTime } =
              result;
            setInfTimes({ inferenceTime, regressionTime });
            // shape check:
            if (box.length == 4 && landm.length == 196 && scores.length == 1) {
              setScore(scores[0]);
              const [xmin, ymin, xmax, ymax] = box;
              ctx.clearRect(0, 0, video?.videoWidth, video?.videoHeight);
              drawRectangle(ctx, xmin, ymin, xmax, ymax);
              for (let i = 0; i < 98; i++) {
                const x = landm[2 * i];
                const y = landm[2 * i + 1];
                drawDot(ctx, x, y);
              }
            }
          }
        }
      }
    }
    await startPrediction_();
  } else {
    // reached only once the loop has actually stopped
    onPredictionStop();
  }
};
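
The loop above relies on a few pieces that are not shown in this post: the drawRectangle and drawDot canvas helpers and a pair of start/stop handlers that flip continuePredictionRef. The sketch below shows one way they might look; every name and signature here is an assumption inferred from how they are called above:

// Illustrative sketches of pieces referenced by the loop but not shown above.

// Canvas helpers (signatures inferred from the calls in the loop).
const drawRectangle = (
  ctx: CanvasRenderingContext2D,
  xmin: number,
  ymin: number,
  xmax: number,
  ymax: number
) => {
  ctx.strokeStyle = "lime";
  ctx.lineWidth = 2;
  ctx.strokeRect(xmin, ymin, xmax - xmin, ymax - ymin);
};

const drawDot = (ctx: CanvasRenderingContext2D, x: number, y: number) => {
  ctx.fillStyle = "red";
  ctx.beginPath();
  ctx.arc(x, y, 2, 0, 2 * Math.PI);
  ctx.fill();
};

// Starting and stopping the loop via the ref it polls on every frame.
const startPrediction = () => {
  continuePredictionRef.current = true;
  startPrediction_();
};

const stopPrediction = () => {
  // startPrediction_ calls onPredictionStop once the current frame finishes.
  continuePredictionRef.current = false;
};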