// YOLOv3-tiny object detection example built on ggml.
//
// Pipeline (see main()): parse arguments -> load GGUF weights -> load the
// input image, class labels and label glyphs -> run the network -> decode
// the two YOLO heads, apply NMS, draw the boxes -> save the annotated image.

#include "ggml/ggml.h"
#include "yolo-image.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <exception>
#include <fstream>
#include <string>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// Parameters of one convolution stage as consumed by apply_conv2d().
// The tensors are looked up by name in the model context by load_model();
// scales / rolling_mean / rolling_variance are only looked up (and only used)
// when batch_normalize is true, and stay nullptr otherwise.
struct conv2d_layer {
    struct ggml_tensor * weights          = nullptr;
    struct ggml_tensor * biases           = nullptr;
    struct ggml_tensor * scales           = nullptr;
    struct ggml_tensor * rolling_mean     = nullptr;
    struct ggml_tensor * rolling_variance = nullptr;
    int padding = 1;
    bool batch_normalize = true;
    bool activate = true; // true for leaky relu, false for linear
};

// Network input size, the convolution stages, and the ggml context that
// holds the weight tensors.
struct yolo_model {
    int width = 416;
    int height = 416;
    std::vector<conv2d_layer> conv2d_layers;
    struct ggml_context * ctx = nullptr;
};

// One YOLO detection head: the raw prediction tensor plus the anchor
// configuration used to decode it.
struct yolo_layer {
    int classes = 80;
    std::vector<int> mask;      // indices into the anchor list used by this head
    std::vector<float> anchors; // flat list of (width, height) pairs
    struct ggml_tensor * predictions;

    yolo_layer(int classes, const std::vector<int> & mask, const std::vector<float> & anchors, struct ggml_tensor * predictions)
        : classes(classes), mask(mask), anchors(anchors), predictions(predictions)
    { }

    // Flat offset into the prediction data for a given location and entry.
    // `location` encodes n*w*h + cell; each n owns (4 + classes + 1) planes
    // of w*h floats, and `entry` selects the plane.
    int entry_index(int location, int entry) const {
        int w = predictions->ne[0];
        int h = predictions->ne[1];
        int n   = location / (w*h);
        int loc = location % (w*h);
        return n*w*h*(4+classes+1) + entry*w*h + loc;
    }
};

// Box given by its centre (x, y) and size (w, h).
struct box {
    float x, y, w, h;
};

// A decoded candidate: its box, per-class scores and objectness score.
struct detection {
    box bbox;
    std::vector<float> prob;
    float objectness;
};

// Loads the GGUF file `fname` into model.ctx and binds every layer's tensors
// by name ("l<i>_weights", "l<i>_biases", ...).
// Returns false if the file cannot be opened or a required tensor is missing.
static bool load_model(const std::string & fname, yolo_model & model) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ false,
        /*.ctx      =*/ &model.ctx,
    };
    gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
    if (!ctx) {
        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
        return false;
    }

    model.width  = 416;
    model.height = 416;
    model.conv2d_layers.resize(13);

    // Layers that deviate from the defaults (padding 1, batch norm, leaky relu).
    model.conv2d_layers[7].padding = 0;
    model.conv2d_layers[9].padding = 0;
    model.conv2d_layers[9].batch_normalize = false;
    model.conv2d_layers[9].activate = false;
    model.conv2d_layers[10].padding = 0;
    model.conv2d_layers[12].padding = 0;
    model.conv2d_layers[12].batch_normalize = false;
    model.conv2d_layers[12].activate = false;

    bool all_found = true;
    // Looks up tensor "l<layer>_<suffix>"; records a failure if it is absent
    // so that a bad model file is rejected here rather than crashing later.
    auto fetch = [&](int layer, const char * suffix) -> ggml_tensor * {
        char name[256];
        snprintf(name, sizeof(name), "l%d_%s", layer, suffix);
        ggml_tensor * t = ggml_get_tensor(model.ctx, name);
        if (!t) {
            fprintf(stderr, "load_model: tensor '%s' not found in model file\n", name);
            all_found = false;
        }
        return t;
    };

    for (int i = 0; i < (int)model.conv2d_layers.size(); i++) {
        conv2d_layer & layer = model.conv2d_layers[i];
        layer.weights = fetch(i, "weights");
        layer.biases  = fetch(i, "biases");
        if (layer.batch_normalize) {
            layer.scales           = fetch(i, "scales");
            layer.rolling_mean     = fetch(i, "rolling_mean");
            layer.rolling_variance = fetch(i, "rolling_variance");
        }
    }

    // The gguf context is only needed while reading the file; the tensors
    // live in model.ctx, which the caller releases with ggml_free().
    gguf_free(ctx);

    if (!all_found) {
        if (model.ctx) {
            ggml_free(model.ctx);
            model.ctx = nullptr;
        }
        return false;
    }
    return true;
}

// Appends one label per line of `filename` to `labels`.
// Returns false if the file cannot be opened; asserts that exactly 80 labels
// are present afterwards.
static bool load_labels(const char * filename, std::vector<std::string> & labels) {
    std::ifstream file_in(filename);
    if (!file_in) {
        return false;
    }
    std::string line;
    while (std::getline(file_in, line)) {
        labels.push_back(line);
    }
    GGML_ASSERT(labels.size() == 80);
    return true;
}

// Loads the glyph images "data/labels/<code>_<size>.png" for character codes
// 32..126 and sizes 0..7 into alphabet[size*128 + code].
// Returns false on the first image that fails to load.
static bool load_alphabet(std::vector<yolo_image> & alphabet) {
    alphabet.resize(8 * 128);
    for (int j = 0; j < 8; j++) {
        for (int i = 32; i < 127; i++) {
            char fname[256];
            snprintf(fname, sizeof(fname), "data/labels/%d_%d.png", i, j);
            if (!load_image(fname, alphabet[j*128 + i])) {
                fprintf(stderr, "Cannot load '%s'\n", fname);
                return false;
            }
        }
    }
    return true;
}

// Adds one convolution stage to the graph: conv2d, optional batch
// normalisation ((x - mean) / sqrt(variance) * scale), bias add, and optional
// leaky relu with slope 0.1.
static ggml_tensor * apply_conv2d(ggml_context * ctx, ggml_tensor * input, const conv2d_layer & layer) {
    struct ggml_tensor * result = ggml_conv_2d(ctx, layer.weights, input, 1, 1, layer.padding, layer.padding, 1, 1);
    if (layer.batch_normalize) {
        result = ggml_sub(ctx, result, ggml_repeat(ctx, layer.rolling_mean, result));
        result = ggml_div(ctx, result, ggml_sqrt(ctx, ggml_repeat(ctx, layer.rolling_variance, result)));
        result = ggml_mul(ctx, result, ggml_repeat(ctx, layer.scales, result));
    }
    result = ggml_add(ctx, result, ggml_repeat(ctx, layer.biases, result));
    if (layer.activate) {
        result = ggml_leaky_relu(ctx, result, 0.1f, true);
    }
    return result;
}

// Applies the logistic function in place to x[0..n).
static void activate_array(float * x, const int n) {
    // logistic activation
    for (int i = 0; i < n; i++) {
        x[i] = 1./(1. + exp(-x[i]));
    }
}

// Applies the logistic activation in place to the first two box planes and to
// the objectness + class planes of every anchor in the head. Planes 2 and 3
// (box size) are left raw; get_yolo_box() exponentiates them.
static void apply_yolo(yolo_layer & layer) {
    int w = layer.predictions->ne[0];
    int h = layer.predictions->ne[1];
    int N = layer.mask.size();
    float * data = ggml_get_data_f32(layer.predictions);
    for (int n = 0; n < N; n++) {
        int index = layer.entry_index(n*w*h, 0);
        activate_array(data + index, 2*w*h);
        index = layer.entry_index(n*w*h, 4);
        activate_array(data + index, (1+layer.classes)*w*h);
    }
}

// Decodes one box. (i, j) is the grid cell, lw x lh the grid size, w x h the
// network input size, n the anchor index, and `stride` the distance between
// consecutive entry planes. The result is normalised by the grid and network
// sizes respectively.
static box get_yolo_box(const yolo_layer & layer, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) {
    float * predictions = ggml_get_data_f32(layer.predictions);
    box b;
    b.x = (i + predictions[index + 0*stride]) / lw;
    b.y = (j + predictions[index + 1*stride]) / lh;
    b.w = exp(predictions[index + 2*stride]) * layer.anchors[2*n]   / w;
    b.h = exp(predictions[index + 3*stride]) * layer.anchors[2*n+1] / h;
    return b;
}

// Maps a box from network-input coordinates back to original-image
// coordinates, undoing the aspect-preserving resize and centring of an
// im_w x im_h image inside net_w x net_h.
static void correct_yolo_box(box & b, int im_w, int im_h, int net_w, int net_h) {
    int new_w = 0;
    int new_h = 0;
    if (((float)net_w/im_w) < ((float)net_h/im_h)) {
        new_w = net_w;
        new_h = (im_h * net_w)/im_w;
    } else {
        new_h = net_h;
        new_w = (im_w * net_h)/im_h;
    }
    b.x = (b.x - (net_w - new_w)/2./net_w) / ((float)new_w/net_w);
    b.y = (b.y - (net_h - new_h)/2./net_h) / ((float)new_h/net_h);
    b.w *= (float)net_w/new_w;
    b.h *= (float)net_h/new_h;
}

// Appends to `detections` every (cell, anchor) candidate whose objectness
// exceeds `thresh`. Class scores are objectness * class probability, zeroed
// when not above `thresh`.
static void get_yolo_detections(const yolo_layer & layer, std::vector<detection> & detections, int im_w, int im_h, int netw, int neth, float thresh) {
    int w = layer.predictions->ne[0];
    int h = layer.predictions->ne[1];
    int N = layer.mask.size();
    float * predictions = ggml_get_data_f32(layer.predictions);
    for (int i = 0; i < w*h; i++) {
        for (int n = 0; n < N; n++) {
            int obj_index = layer.entry_index(n*w*h + i, 4);
            float objectness = predictions[obj_index];
            if (objectness <= thresh) {
                continue;
            }
            detection det;
            int box_index = layer.entry_index(n*w*h + i, 0);
            int row = i / w;
            int col = i % w;
            det.bbox = get_yolo_box(layer, layer.mask[n], box_index, col, row, w, h, netw, neth, w*h);
            correct_yolo_box(det.bbox, im_w, im_h, netw, neth);
            det.objectness = objectness;
            det.prob.resize(layer.classes);
            for (int j = 0; j < layer.classes; j++) {
                int class_index = layer.entry_index(n*w*h + i, 4 + 1 + j);
                float prob = objectness*predictions[class_index];
                det.prob[j] = (prob > thresh) ? prob : 0;
            }
            detections.push_back(det);
        }
    }
}

// Signed length of the overlap of two 1-D intervals given by centre and
// width; negative when the intervals are disjoint.
static float overlap(float x1, float w1, float x2, float w2) {
    float l1 = x1 - w1/2;
    float l2 = x2 - w2/2;
    float left = l1 > l2 ? l1 : l2;
    float r1 = x1 + w1/2;
    float r2 = x2 + w2/2;
    float right = r1 < r2 ? r1 : r2;
    return right - left;
}

// Area of the intersection of two boxes; 0 when they do not overlap.
static float box_intersection(const box & a, const box & b) {
    float w = overlap(a.x, a.w, b.x, b.w);
    float h = overlap(a.y, a.h, b.y, b.h);
    if (w < 0 || h < 0) return 0;
    float area = w*h;
    return area;
}

// Area of the union of two boxes.
static float box_union(const box & a, const box & b) {
    float i = box_intersection(a, b);
    float u = a.w*a.h + b.w*b.h - i;
    return u;
}

// Intersection over union of two boxes.
static float box_iou(const box & a, const box & b) {
    return box_intersection(a, b)/box_union(a, b);
}

// Per-class non-maximum suppression. Detections with zero objectness are
// first moved to the tail and ignored. Then, for every class, the remaining
// detections are sorted by descending score and each detection zeroes that
// class score in every lower-ranked detection whose IoU with it exceeds
// `thresh`.
static void do_nms_sort(std::vector<detection> & dets, int classes, float thresh) {
    int last = (int)dets.size()-1;
    for (int i = 0; i <= last; ++i) {
        if (dets[i].objectness == 0) {
            std::swap(dets[i], dets[last]);
            --last;
            --i; // re-examine the element that was just swapped in
        }
    }
    int total = last+1;
    for (int c = 0; c < classes; ++c) {
        std::sort(dets.begin(), dets.begin()+total, [=](const detection & a, const detection & b) {
            return a.prob[c] > b.prob[c];
        });
        for (int i = 0; i < total; ++i) {
            if (dets[i].prob[c] == 0) {
                continue;
            }
            box a = dets[i].bbox;
            for (int j = i+1; j < total; ++j) {
                box b = dets[j].bbox;
                if (box_iou(a, b) > thresh) {
                    dets[j].prob[c] = 0;
                }
            }
        }
    }
}

// Linearly interpolates channel `c` of a fixed 6-entry colour table at
// position x/max. Callers must keep 0 <= x <= max so the table indices stay
// in range.
static float get_color(int c, int x, int max) {
    float colors[6][3] = { {1,0,1}, {0,0,1}, {0,1,1}, {0,1,0}, {1,1,0}, {1,0,0} };
    float ratio = ((float)x/max)*5;
    int i = floor(ratio);
    int j = ceil(ratio);
    ratio -= i;
    float r = (1-ratio) * colors[i][c] + ratio*colors[j][c];
    return r;
}

// For every detection with at least one class score above `thresh`: prints
// each such class with its percentage, draws the box clamped to the image,
// and draws a label listing all matching class names. The colour is derived
// from the first matching class.
static void draw_detections(yolo_image & im, const std::vector<detection> & dets, float thresh, const std::vector<std::string> & labels, const std::vector<yolo_image> & alphabet) {
    int classes = (int)labels.size();
    for (int i = 0; i < (int)dets.size(); i++) {
        std::string labelstr;
        int cl = -1;
        for (int j = 0; j < (int)dets[i].prob.size(); j++) {
            if (dets[i].prob[j] > thresh) {
                if (cl < 0) {
                    labelstr = labels[j];
                    cl = j;
                } else {
                    labelstr += ", ";
                    labelstr += labels[j];
                }
                printf("%s: %.0f%%\n", labels[j].c_str(), dets[i].prob[j]*100);
            }
        }
        if (cl >= 0) {
            int width = im.h * .006;
            int offset = cl*123457 % classes;
            float red = get_color(2,offset,classes);
            float green = get_color(1,offset,classes);
            float blue = get_color(0,offset,classes);
            float rgb[3];
            rgb[0] = red;
            rgb[1] = green;
            rgb[2] = blue;
            box b = dets[i].bbox;
            int left  = (b.x-b.w/2.)*im.w;
            int right = (b.x+b.w/2.)*im.w;
            int top   = (b.y-b.h/2.)*im.h;
            int bot   = (b.y+b.h/2.)*im.h;
            if (left < 0) left = 0;
            if (right > im.w-1) right = im.w-1;
            if (top < 0) top = 0;
            if (bot > im.h-1) bot = im.h-1;
            draw_box_width(im, left, top, right, bot, width, red, green, blue);
            yolo_image label = get_label(alphabet, labelstr, (im.h*.03));
            draw_label(im, top + width, left, label, rgb);
        }
    }
}

// Prints the four dimensions of a layer's output tensor.
static void print_shape(int layer, const ggml_tensor * t) {
    printf("Layer %2d output shape: %3d x %3d x %4d x %3d\n", layer, (int)t->ne[0], (int)t->ne[1], (int)t->ne[2], (int)t->ne[3]);
}

// Runs the network on `img`, decodes both YOLO heads, applies NMS with an IoU
// threshold of 0.45 and draws the surviving detections onto `img`.
// The compute buffer is allocated once and reused across calls.
void detect(yolo_image & img, const yolo_model & model, float thresh, const std::vector<std::string> & labels, const std::vector<yolo_image> & alphabet) {
    static size_t buf_size = 20000000 * sizeof(float) * 4;
    static void * buf = malloc(buf_size);

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    if (!ctx0) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        return;
    }
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
    std::vector<detection> detections;

    yolo_image sized = letterbox_image(img, model.width, model.height);
    struct ggml_tensor * input = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, model.width, model.height, 3, 1);
    std::memcpy(input->data, sized.data.data(), ggml_nbytes(input));
    ggml_set_name(input, "input");

    // The numbers passed to print_shape() are layer indices of the network
    // description, not indices into model.conv2d_layers; 16 and 17 have no
    // tensor operation in this graph (presumably the first detection head and
    // a route layer of the original network definition).
    struct ggml_tensor * result = apply_conv2d(ctx0, input, model.conv2d_layers[0]);
    print_shape(0, result);
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
    print_shape(1, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[1]);
    print_shape(2, result);
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
    print_shape(3, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[2]);
    print_shape(4, result);
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
    print_shape(5, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[3]);
    print_shape(6, result);
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
    print_shape(7, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[4]);
    struct ggml_tensor * layer_8 = result;
    print_shape(8, result);
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
    print_shape(9, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[5]);
    print_shape(10, result);
    // 2x2 max pool with stride 1 and padding 0.5
    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 1, 1, 0.5, 0.5);
    print_shape(11, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[6]);
    print_shape(12, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[7]);
    struct ggml_tensor * layer_13 = result;
    print_shape(13, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[8]);
    print_shape(14, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[9]);
    struct ggml_tensor * layer_15 = result;
    print_shape(15, result);

    // Second branch: starts again from layer 13, is upscaled by 2 and
    // concatenated with layer 8.
    result = apply_conv2d(ctx0, layer_13, model.conv2d_layers[10]);
    print_shape(18, result);
    result = ggml_upscale(ctx0, result, 2);
    print_shape(19, result);
    result = ggml_concat(ctx0, result, layer_8);
    print_shape(20, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[11]);
    print_shape(21, result);
    result = apply_conv2d(ctx0, result, model.conv2d_layers[12]);
    struct ggml_tensor * layer_22 = result;
    print_shape(22, result);

    ggml_build_forward_expand(gf, layer_15);
    ggml_build_forward_expand(gf, layer_22);
    ggml_graph_compute_with_ctx(ctx0, gf, 1);

    yolo_layer yolo16{ 80, {3, 4, 5}, {10, 14, 23, 27, 37,58, 81, 82, 135, 169, 344, 319}, layer_15};
    apply_yolo(yolo16);
    get_yolo_detections(yolo16, detections, img.w, img.h, model.width, model.height, thresh);

    yolo_layer yolo23{ 80, {0, 1, 2}, {10, 14, 23, 27, 37,58, 81, 82, 135, 169, 344, 319}, layer_22};
    apply_yolo(yolo23);
    get_yolo_detections(yolo23, detections, img.w, img.h, model.width, model.height, thresh);

    do_nms_sort(detections, yolo23.classes, .45);
    draw_detections(img, detections, thresh, labels, alphabet);
    ggml_free(ctx0);
}

// Command-line options and their defaults.
struct yolo_params {
    float thresh          = 0.5;
    std::string model     = "yolov3-tiny.gguf";
    std::string fname_inp = "input.jpg";
    std::string fname_out = "predictions.jpg";
};

// Prints the usage text to stderr, showing the current values of `params` as
// the defaults. `argc` is unused.
void yolo_print_usage(int argc, char ** argv, const yolo_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help show this help message and exit\n");
    fprintf(stderr, " -th T, --thresh T detection threshold (default: %.2f)\n", params.thresh);
    fprintf(stderr, " -m FNAME, --model FNAME\n");
    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, " -i FNAME, --inp FNAME\n");
    fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
    fprintf(stderr, " -o FNAME, --out FNAME\n");
    fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
}

// Parses the command line into `params`.
// -h/--help prints the usage text and exits with status 0. Returns false
// (after printing an error and the usage text) on an unknown argument, an
// option that is missing its value, or a threshold that is not a number.
bool yolo_params_parse(int argc, char ** argv, yolo_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            yolo_print_usage(argc, argv, params);
            exit(0);
        }

        const bool is_thresh = (arg == "-th" || arg == "--thresh");
        const bool is_model  = (arg == "-m"  || arg == "--model");
        const bool is_inp    = (arg == "-i"  || arg == "--inp");
        const bool is_out    = (arg == "-o"  || arg == "--out");

        if (!(is_thresh || is_model || is_inp || is_out)) {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            yolo_print_usage(argc, argv, params);
            return false;
        }

        // Every remaining option takes a value; make sure it is there before
        // reading argv[i + 1].
        if (i + 1 >= argc) {
            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
            yolo_print_usage(argc, argv, params);
            return false;
        }
        const char * value = argv[++i];

        if (is_thresh) {
            try {
                params.thresh = std::stof(value);
            } catch (const std::exception &) {
                fprintf(stderr, "error: invalid value for argument %s: %s\n", arg.c_str(), value);
                yolo_print_usage(argc, argv, params);
                return false;
            }
        } else if (is_model) {
            params.model = value;
        } else if (is_inp) {
            params.fname_inp = value;
        } else {
            params.fname_out = value;
        }
    }
    return true;
}

int main(int argc, char *argv[]) {
    ggml_time_init();
    yolo_model model;

    yolo_params params;
    if (!yolo_params_parse(argc, argv, params)) {
        return 1;
    }
    if (!load_model(params.model, model)) {
        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
        return 1;
    }
    yolo_image img(0,0,0);
    if (!load_image(params.fname_inp.c_str(), img)) {
        fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, params.fname_inp.c_str());
        return 1;
    }
    std::vector<std::string> labels;
    if (!load_labels("data/coco.names", labels)) {
        fprintf(stderr, "%s: failed to load labels from 'data/coco.names'\n", __func__);
        return 1;
    }
    std::vector<yolo_image> alphabet;
    if (!load_alphabet(alphabet)) {
        fprintf(stderr, "%s: failed to load alphabet\n", __func__);
        return 1;
    }
    const int64_t t_start_ms = ggml_time_ms();
    detect(img, model, params.thresh, labels, alphabet);
    const int64_t t_detect_ms = ggml_time_ms() - t_start_ms;
    if (!save_image(img, params.fname_out.c_str(), 80)) {
        fprintf(stderr, "%s: failed to save image to '%s'\n", __func__, params.fname_out.c_str());
        return 1;
    }
    printf("Detected objects saved in '%s' (time: %f sec.)\n", params.fname_out.c_str(), t_detect_ms / 1000.0f);
    ggml_free(model.ctx);
    return 0;
}