From 74ce1d2d11a56b522a343d944bf5b184bb2d9212 Mon Sep 17 00:00:00 2001
From: Wenbin Chen
Date: Thu, 21 Sep 2023 09:26:31 +0800
Subject: [PATCH] libavfilter/dnn: add layout option to openvino backend

DNN models can have different input layouts (NCHW or NHWC), so a
"layout" option is added. Use OpenVINO's API to do layout conversion
for input data. Use swscale to do layout conversion for output data,
as OpenVINO doesn't have a similar C API for output.

Signed-off-by: Wenbin Chen
---
 libavfilter/dnn/dnn_backend_openvino.c |  47 +++++++-
 libavfilter/dnn/dnn_io_proc.c          | 151 ++++++++++++++++++++++---
 libavfilter/dnn_interface.h            |   7 ++
 3 files changed, 185 insertions(+), 20 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index ded156289b..ae53488837 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -45,6 +45,7 @@ typedef struct OVOptions{
     uint8_t async;
     int batch_size;
     int input_resizable;
+    DNNLayout layout;
 } OVOptions;
 
 typedef struct OVContext {
@@ -100,6 +101,10 @@ static const AVOption dnn_openvino_options[] = {
     DNN_BACKEND_COMMON_OPTIONS
     { "batch_size", "batch size per request", OFFSET(options.batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1000, FLAGS},
     { "input_resizable", "can input be resizable or not", OFFSET(options.input_resizable), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+    { "layout", "input layout of model", OFFSET(options.layout), AV_OPT_TYPE_INT, { .i64 = DL_NONE}, DL_NONE, DL_NHWC, FLAGS, "layout" },
+    { "none", "none", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NONE }, 0, 0, FLAGS, "layout"},
+    { "nchw", "nchw", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NCHW }, 0, 0, FLAGS, "layout"},
+    { "nhwc", "nhwc", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NHWC }, 0, 0, FLAGS, "layout"},
     { NULL }
 };
 
@@ -235,9 +240,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
         avpriv_report_missing_feature(ctx, "Do not support dynamic model.");
         return AVERROR(ENOSYS);
     }
-    input.height = dims[2];
-    input.width = dims[3];
-    input.channels = dims[1];
+    input.height = dims[1];
+    input.width = dims[2];
+    input.channels = dims[3];
     input.dt = precision_to_datatype(precision);
     input.data = av_malloc(input.height * input.width * input.channels * get_datatype_size(input.dt));
     if (!input.data) {
@@ -412,6 +417,7 @@ static void infer_completion_callback(void *args)
     av_assert0(request->lltask_count <= dims.dims[0]);
 #endif
     output.dt = precision_to_datatype(precision);
+    output.layout = ctx->options.layout;
 
     av_assert0(request->lltask_count >= 1);
     for (int i = 0; i < request->lltask_count; ++i) {
@@ -540,11 +546,14 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     OVContext *ctx = &ov_model->ctx;
 #if HAVE_OPENVINO2
     ov_status_e status;
-    ov_preprocess_input_tensor_info_t* input_tensor_info;
-    ov_preprocess_output_tensor_info_t* output_tensor_info;
+    ov_preprocess_input_tensor_info_t* input_tensor_info = NULL;
+    ov_preprocess_output_tensor_info_t* output_tensor_info = NULL;
+    ov_preprocess_input_model_info_t* input_model_info = NULL;
     ov_model_t *tmp_ov_model;
     ov_layout_t* NHWC_layout = NULL;
+    ov_layout_t* NCHW_layout = NULL;
     const char* NHWC_desc = "NHWC";
+    const char* NCHW_desc = "NCHW";
     const char* device = ctx->options.device_type;
 #else
     IEStatusCode status;
@@ -589,6 +598,7 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     //set input layout
     status = ov_layout_create(NHWC_desc, &NHWC_layout);
+    status |= ov_layout_create(NCHW_desc, &NCHW_layout);
     if (status != OK) {
         av_log(ctx, AV_LOG_ERROR, "Failed to create layout for input.\n");
         ret = ov2_map_error(status, NULL);
         goto err;
     }
@@ -602,6 +612,22 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         goto err;
     }
 
+    status = ov_preprocess_input_info_get_model_info(ov_model->input_info, &input_model_info);
+    if (status != OK) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get input model info\n");
+        ret = ov2_map_error(status, NULL);
+        goto err;
+    }
+    if (ctx->options.layout == DL_NCHW)
+        status = ov_preprocess_input_model_info_set_layout(input_model_info, NCHW_layout);
+    else if (ctx->options.layout == DL_NHWC)
+        status = ov_preprocess_input_model_info_set_layout(input_model_info, NHWC_layout);
+    if (status != OK) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to set input model layout\n");
+        ret = ov2_map_error(status, NULL);
+        goto err;
+    }
+
     if (ov_model->model->func_type != DFT_PROCESS_FRAME)
         //set precision only for detect and classify
         status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8);
@@ -639,6 +665,9 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         ret = ov2_map_error(status, NULL);
         goto err;
     }
+    ov_preprocess_input_model_info_free(input_model_info);
+    ov_layout_free(NCHW_layout);
+    ov_layout_free(NHWC_layout);
 #else
     if (ctx->options.batch_size > 1) {
         input_shapes_t input_shapes;
@@ -783,6 +812,14 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     return 0;
 
 err:
+#if HAVE_OPENVINO2
+    if (NCHW_layout)
+        ov_layout_free(NCHW_layout);
+    if (NHWC_layout)
+        ov_layout_free(NHWC_layout);
+    if (input_model_info)
+        ov_preprocess_input_model_info_free(input_model_info);
+#endif
     dnn_free_model_ov(&ov_model->model);
     return ret;
 }
diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c
index 7961bf6b95..dfa0d5e5da 100644
--- a/libavfilter/dnn/dnn_io_proc.c
+++ b/libavfilter/dnn/dnn_io_proc.c
@@ -27,6 +27,12 @@
 int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
 {
     struct SwsContext *sws_ctx;
+    int ret = 0;
+    int linesize[4] = { 0 };
+    void **dst_data = NULL;
+    void *middle_data = NULL;
+    uint8_t *planar_data[4] = { 0 };
+    int plane_size = frame->width * frame->height * sizeof(uint8_t);
     int bytewidth = av_image_get_linesize(frame->format, frame->width, 0);
     if (bytewidth < 0) {
         return AVERROR(EINVAL);
@@ -35,6 +41,17 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
         avpriv_report_missing_feature(log_ctx, "data type rather than DNN_FLOAT");
         return AVERROR(ENOSYS);
     }
 
+    dst_data = (void **)frame->data;
+    linesize[0] = frame->linesize[0];
+    if (output->layout == DL_NCHW) {
+        middle_data = av_malloc(plane_size * output->channels);
+        if (!middle_data) {
+            ret = AVERROR(ENOMEM);
+            goto err;
+        }
+        dst_data = &middle_data;
+        linesize[0] = frame->width * 3;
+    }
     switch (frame->format) {
     case AV_PIX_FMT_RGB24:
@@ -51,18 +68,52 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
                                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
                                 av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width * 3, frame->height,
                                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),   frame->width * 3, frame->height);
-            return AVERROR(EINVAL);
+            ret = AVERROR(EINVAL);
+            goto err;
         }
         sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0},
                            (const int[4]){frame->width * 3 * sizeof(float), 0, 0, 0}, 0, frame->height,
-                           (uint8_t * const*)frame->data, frame->linesize);
+                           (uint8_t * const*)dst_data, linesize);
         sws_freeContext(sws_ctx);
-        return 0;
+        // convert data from planar to packed
+        if (output->layout == DL_NCHW) {
+            sws_ctx = sws_getContext(frame->width,
+                                     frame->height,
+                                     AV_PIX_FMT_GBRP,
+                                     frame->width,
+                                     frame->height,
+                                     frame->format,
+                                     0, NULL, NULL, NULL);
+            if (!sws_ctx) {
+                av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
+                       "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
+                       av_get_pix_fmt_name(AV_PIX_FMT_GBRP), frame->width, frame->height,
+                       av_get_pix_fmt_name(frame->format), frame->width, frame->height);
+                ret = AVERROR(EINVAL);
+                goto err;
+            }
+            if (frame->format == AV_PIX_FMT_RGB24) {
+                planar_data[0] = (uint8_t *)middle_data + plane_size;
+                planar_data[1] = (uint8_t *)middle_data + plane_size * 2;
+                planar_data[2] = (uint8_t *)middle_data;
+            } else if (frame->format == AV_PIX_FMT_BGR24) {
+                planar_data[0] = (uint8_t *)middle_data + plane_size;
+                planar_data[1] = (uint8_t *)middle_data;
+                planar_data[2] = (uint8_t *)middle_data + plane_size * 2;
+            }
+            sws_scale(sws_ctx, (const uint8_t * const *)planar_data,
+                      (const int [4]){frame->width * sizeof(uint8_t),
+                                      frame->width * sizeof(uint8_t),
+                                      frame->width * sizeof(uint8_t), 0},
+                      0, frame->height, frame->data, frame->linesize);
+            sws_freeContext(sws_ctx);
+        }
+        break;
     case AV_PIX_FMT_GRAYF32:
         av_image_copy_plane(frame->data[0], frame->linesize[0],
                             output->data, bytewidth,
                             bytewidth, frame->height);
-        return 0;
+        break;
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
     case AV_PIX_FMT_YUV444P:
@@ -82,24 +133,34 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
                                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
                                 av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width, frame->height,
                                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),   frame->width, frame->height);
-            return AVERROR(EINVAL);
+            ret = AVERROR(EINVAL);
+            goto err;
         }
         sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0},
                            (const int[4]){frame->width * sizeof(float), 0, 0, 0}, 0, frame->height,
                            (uint8_t * const*)frame->data, frame->linesize);
         sws_freeContext(sws_ctx);
-        return 0;
+        break;
     default:
         avpriv_report_missing_feature(log_ctx, "%s", av_get_pix_fmt_name(frame->format));
-        return AVERROR(ENOSYS);
+        ret = AVERROR(ENOSYS);
+        goto err;
     }
 
-    return 0;
+err:
+    av_free(middle_data);
+    return ret;
 }
 
 int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
 {
     struct SwsContext *sws_ctx;
+    int ret = 0;
+    int linesize[4] = { 0 };
+    void **src_data = NULL;
+    void *middle_data = NULL;
+    uint8_t *planar_data[4] = { 0 };
+    int plane_size = frame->width * frame->height * sizeof(uint8_t);
     int bytewidth = av_image_get_linesize(frame->format, frame->width, 0);
     if (bytewidth < 0) {
         return AVERROR(EINVAL);
@@ -109,9 +170,54 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
         return AVERROR(ENOSYS);
     }
 
+    src_data = (void **)frame->data;
+    linesize[0] = frame->linesize[0];
+    if (input->layout == DL_NCHW) {
+        middle_data = av_malloc(plane_size * input->channels);
+        if (!middle_data) {
+            ret = AVERROR(ENOMEM);
+            goto err;
+        }
+        src_data = &middle_data;
+        linesize[0] = frame->width * 3;
+    }
+
     switch (frame->format) {
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_BGR24:
+        // convert data from packed to planar
+        if (input->layout == DL_NCHW) {
+            sws_ctx = sws_getContext(frame->width,
+                                     frame->height,
+                                     frame->format,
+                                     frame->width,
+                                     frame->height,
+                                     AV_PIX_FMT_GBRP,
+                                     0, NULL, NULL, NULL);
+            if (!sws_ctx) {
+                av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
"fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", + av_get_pix_fmt_name(frame->format), frame->width, frame->height, + av_get_pix_fmt_name(AV_PIX_FMT_GBRP),frame->width, frame->height); + ret = AVERROR(EINVAL); + goto err; + } + if (frame->format == AV_PIX_FMT_RGB24) { + planar_data[0] = (uint8_t *)middle_data + plane_size; + planar_data[1] = (uint8_t *)middle_data + plane_size * 2; + planar_data[2] = (uint8_t *)middle_data; + } else if (frame->format == AV_PIX_FMT_BGR24) { + planar_data[0] = (uint8_t *)middle_data + plane_size; + planar_data[1] = (uint8_t *)middle_data; + planar_data[2] = (uint8_t *)middle_data + plane_size * 2; + } + sws_scale(sws_ctx, (const uint8_t * const *)frame->data, + frame->linesize, 0, frame->height, planar_data, + (const int [4]){frame->width * sizeof(uint8_t), + frame->width * sizeof(uint8_t), + frame->width * sizeof(uint8_t), 0}); + sws_freeContext(sws_ctx); + } sws_ctx = sws_getContext(frame->width * 3, frame->height, AV_PIX_FMT_GRAY8, @@ -124,10 +230,11 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width * 3, frame->height, av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width * 3, frame->height); - return AVERROR(EINVAL); + ret = AVERROR(EINVAL); + goto err; } - sws_scale(sws_ctx, (const uint8_t **)frame->data, - frame->linesize, 0, frame->height, + sws_scale(sws_ctx, (const uint8_t **)src_data, + linesize, 0, frame->height, (uint8_t * const [4]){input->data, 0, 0, 0}, (const int [4]){frame->width * 3 * sizeof(float), 0, 0, 0}); sws_freeContext(sws_ctx); @@ -156,7 +263,8 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width, frame->height, av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width, frame->height); - return AVERROR(EINVAL); + ret = AVERROR(EINVAL); + goto err; } sws_scale(sws_ctx, (const uint8_t **)frame->data, frame->linesize, 0, frame->height, @@ -166,10 +274,12 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) break; default: avpriv_report_missing_feature(log_ctx, "%s", av_get_pix_fmt_name(frame->format)); - return AVERROR(ENOSYS); + ret = AVERROR(ENOSYS); + goto err; } - - return 0; +err: + av_free(middle_data); + return ret; } static enum AVPixelFormat get_pixel_format(DNNData *data) @@ -205,6 +315,11 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); av_assert0(sd); + if (input->layout == DL_NCHW) { + av_log(log_ctx, AV_LOG_ERROR, "dnn_classify input data doesn't support layout: NCHW\n"); + return AVERROR(ENOSYS); + } + header = (const AVDetectionBBoxHeader *)sd->data; bbox = av_get_detection_bbox(header, bbox_index); @@ -257,6 +372,12 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx) int linesizes[4]; int ret = 0; enum AVPixelFormat fmt = get_pixel_format(input); + + if (input->layout == DL_NCHW) { + av_log(log_ctx, AV_LOG_ERROR, "dnn_detect input data doesn't support layout: NCHW\n"); + return AVERROR(ENOSYS); + } + sws_ctx = sws_getContext(frame->width, frame->height, frame->format, input->width, input->height, fmt, SWS_FAST_BILINEAR, NULL, NULL, NULL); diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h index 20c6a0a896..956a63443a 100644 --- a/libavfilter/dnn_interface.h +++ b/libavfilter/dnn_interface.h @@ -56,12 +56,19 @@ 
     DFT_ANALYTICS_CLASSIFY, // classify for each bounding box
 }DNNFunctionType;
 
+typedef enum {
+    DL_NONE,
+    DL_NCHW,
+    DL_NHWC,
+} DNNLayout;
+
 typedef struct DNNData{
     void *data;
     int width, height, channels;
     // dt and order together decide the color format
     DNNDataType dt;
     DNNColorOrder order;
+    DNNLayout layout;
 } DNNData;
 
 typedef struct DNNExecBaseParams {
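
Usage sketch (illustrative, not part of the patch): the new "layout" key is an
OpenVINO backend option, so it is expected to be passed through a dnn filter's
backend_configs string. The model path and tensor names below are hypothetical,
and the exact escaping of the inner '=' inside a filtergraph may vary with the
option parser:

    ffmpeg -i input.png -vf "dnn_processing=dnn_backend=openvino:model=model.xml:input=x:output=y:backend_configs=layout=nchw" output.png

With the default value "none" no layout is set on the model and existing
command lines keep their previous behavior; "nchw" or "nhwc" tells the backend
which layout the model's input tensor expects.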