cmdr2 committed · Commit 67e8c32 · 1 Parent(s): 3d4f29c

cuda/cpu: Increase support for fp16 unary operations (ggml/1125)


* Support fp16 unary operations in the CUDA backend

* cpu: increase fp16 support for unary operators in the CPU backend (the conversion pattern these changes share is sketched just after this list)

* cuda: increase fp16 support for unary operators in the CUDA backend

* Add test cases for fp16 unary operators

* metal: update supports_op for unary operators that don't support fp16, to prevent test-backend-ops from failing

* metal: fix PR comments for unary op support after fp16 unary tests
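
All of the fp16 CPU routines added below share one convert–compute–convert shape: widen each fp16 element to fp32, reuse the existing f32 math, and narrow the result back to fp16. A minimal sketch of that pattern, assuming ggml's ggml_fp16_t type and the GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 conversion macros are in scope (the function name here is illustrative, not part of the commit):

#include <math.h>

// Sketch of the pattern used by ggml_vec_tanh_f16 and the other _f16 helpers below.
// Assumes ggml_fp16_t and the fp16<->fp32 conversion macros from the ggml headers.
static inline void vec_tanh_f16_sketch(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        const float v = GGML_FP16_TO_FP32(x[i]); // widen to fp32
        y[i] = GGML_FP32_TO_FP16(tanhf(v));      // compute in fp32, narrow back to fp16
    }
}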

ggml/src/ggml-cpu/ggml-cpu.c CHANGED
 
1432
  inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
1433
  inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
1434
  inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
1435
+ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1436
+ for (int i = 0; i < n; ++i) {
1437
+ y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
1438
+ }
1439
+ }
1440
+
1441
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
1442
  inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
1443
  for (int i = 0; i < n; ++i) {
 
1836
 
1837
  inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
1838
  inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
1839
+ inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1840
+ for (int i = 0; i < n; ++i) {
1841
+ float v = GGML_FP16_TO_FP32(x[i]);
1842
+ y[i] = GGML_FP32_TO_FP16(v*v);
1843
+ }
1844
+ }
1845
  inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
1846
+ inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1847
+ for (int i = 0; i < n; ++i) {
1848
+ y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
1849
+ }
1850
+ }
1851
  inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
1852
+ inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1853
+ for (int i = 0; i < n; ++i) {
1854
+ y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
1855
+ }
1856
+ }
1857
  inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
1858
+ inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1859
+ for (int i = 0; i < n; ++i) {
1860
+ y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
1861
+ }
1862
+ }
1863
  inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
1864
+ inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1865
+ for (int i = 0; i < n; ++i) {
1866
+ y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
1867
+ }
1868
+ }
1869
  inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
1870
+ inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1871
+ for (int i = 0; i < n; ++i) {
1872
+ y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
1873
+ }
1874
+ }
1875
  inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
1876
+ inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1877
+ for (int i = 0; i < n; ++i) {
1878
+ float v = GGML_FP16_TO_FP32(x[i]);
1879
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
1880
+ }
1881
+ }
1882
  inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
1883
+ inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1884
+ for (int i = 0; i < n; ++i) {
1885
+ y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
1886
+ }
1887
+ }
1888
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1889
+ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1890
+ for (int i = 0; i < n; ++i) {
1891
+ y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
1892
+ }
1893
+ }
1894
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
1895
+ inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1896
+ for (int i = 0; i < n; ++i) {
1897
+ y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
1898
+ }
1899
+ }
1900
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1901
+ inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1902
+ for (int i = 0; i < n; ++i) {
1903
+ float v = GGML_FP16_TO_FP32(x[i]);
1904
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
1905
+ }
1906
+ }
1907
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1908
+ inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
1909
+ for (int i = 0; i < n; ++i) {
1910
+ float v = GGML_FP16_TO_FP32(x[i]);
1911
+ y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
1912
+ }
1913
+ }
1914
  inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
1915
+ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1916
+ for (int i = 0; i < n; ++i) {
1917
+ y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
1918
+ }
1919
+ }
1920
  // TODO: optimize performance
1921
  inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1922
+ inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1923
+ for (int i = 0; i < n; ++i) {
1924
+ float v = GGML_FP16_TO_FP32(x[i]);
1925
+ y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
1926
+ }
1927
+ }
1928
  inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1929
+ inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1930
+ for (int i = 0; i < n; ++i) {
1931
+ y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
1932
+ }
1933
+ }
1934
  inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
1935
+ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1936
+ for (int i = 0; i < n; ++i) {
1937
+ y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
1938
+ }
1939
+ }
1940
 
1941
  static const float GELU_COEF_A = 0.044715f;
1942
  static const float GELU_QUICK_COEF = -1.702f;
 
2004
  }
2005
  #endif
2006
 
2007
+ inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
2008
+ for (int i = 0; i < n; ++i) {
2009
+ float v = GGML_FP16_TO_FP32(x[i]);
2010
+ y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
2011
+ }
2012
+ }
2013
+
2014
  // Sigmoid Linear Unit (SiLU) function
2015
  inline static float ggml_silu_f32(float x) {
2016
  return x/(1.0f + expf(-x));
2017
  }
2018
+ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
2019
+ float v = GGML_FP16_TO_FP32(x);
2020
+ return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
2021
+ }
2022
 
2023
  #if __FINITE_MATH_ONLY__
2024
  #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
 
2242
  }
2243
  }
2244
 
2245
+ inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
2246
+ for (int i = 0; i < n; ++i) {
2247
+ y[i] = ggml_silu_f16(x[i]);
2248
+ }
2249
+ }
2250
+
2251
  static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
2252
  int i = 0;
2253
  ggml_float sum = 0;
 
2319
  return dy*s*(1.0f + x*(1.0f - s));
2320
  }
2321
 
2322
+ inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
2323
+ const float v = GGML_FP16_TO_FP32(x);
2324
+ const float s = 1.0f/(1.0f + expf(-v));
2325
+ return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
2326
+ }
2327
+
2328
  inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
2329
  for (int i = 0; i < n; ++i) {
2330
  dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
2331
  }
2332
  }
2333
 
2334
+ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
2335
+ for (int i = 0; i < n; ++i) {
2336
+ dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
2337
+ }
2338
+ }
2339
+
2340
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
2341
  #ifndef GGML_USE_ACCELERATE
2342
  ggml_float sum = 0.0;
 
5743
  }
5744
  }
5745
 
5746
+ static void ggml_compute_forward_sqr_f16(
5747
+ const struct ggml_compute_params * params,
5748
+ struct ggml_tensor * dst) {
5749
+
5750
+ const struct ggml_tensor * src0 = dst->src[0];
5751
+
5752
+ if (params->ith != 0) {
5753
+ return;
5754
+ }
5755
+
5756
+ assert(ggml_are_same_shape(src0, dst));
5757
+
5758
+ const int n = ggml_nrows(src0);
5759
+ const int nc = src0->ne[0];
5760
+
5761
+ assert( dst->nb[0] == sizeof(ggml_fp16_t));
5762
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
5763
+
5764
+ for (int i = 0; i < n; i++) {
5765
+ ggml_vec_sqr_f16(nc,
5766
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
5767
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
5768
+ }
5769
+ }
5770
+
5771
  static void ggml_compute_forward_sqr(
5772
  const struct ggml_compute_params * params,
5773
  struct ggml_tensor * dst) {
 
5779
  {
5780
  ggml_compute_forward_sqr_f32(params, dst);
5781
  } break;
5782
+ case GGML_TYPE_F16:
5783
+ {
5784
+ ggml_compute_forward_sqr_f16(params, dst);
5785
+ } break;
5786
  default:
5787
  {
5788
  GGML_ABORT("fatal error");
 
5817
  }
5818
  }
5819
 
5820
+ static void ggml_compute_forward_sqrt_f16(
5821
+ const struct ggml_compute_params * params,
5822
+ struct ggml_tensor * dst) {
5823
+
5824
+ const struct ggml_tensor * src0 = dst->src[0];
5825
+
5826
+ if (params->ith != 0) {
5827
+ return;
5828
+ }
5829
+
5830
+ assert(ggml_are_same_shape(src0, dst));
5831
+
5832
+ const int n = ggml_nrows(src0);
5833
+ const int nc = src0->ne[0];
5834
+
5835
+ assert( dst->nb[0] == sizeof(ggml_fp16_t));
5836
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
5837
+
5838
+ for (int i = 0; i < n; i++) {
5839
+ ggml_vec_sqrt_f16(nc,
5840
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
5841
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
5842
+ }
5843
+ }
5844
+
5845
  static void ggml_compute_forward_sqrt(
5846
  const struct ggml_compute_params * params,
5847
  struct ggml_tensor * dst) {
 
5853
  {
5854
  ggml_compute_forward_sqrt_f32(params, dst);
5855
  } break;
5856
+ case GGML_TYPE_F16:
5857
+ {
5858
+ ggml_compute_forward_sqrt_f16(params, dst);
5859
+ } break;
5860
  default:
5861
  {
5862
  GGML_ABORT("fatal error");
 
5891
  }
5892
  }
5893
 
5894
+ static void ggml_compute_forward_log_f16(
5895
+ const struct ggml_compute_params * params,
5896
+ struct ggml_tensor * dst) {
5897
+
5898
+ const struct ggml_tensor * src0 = dst->src[0];
5899
+
5900
+ if (params->ith != 0) {
5901
+ return;
5902
+ }
5903
+
5904
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
5905
+
5906
+ const int n = ggml_nrows(src0);
5907
+ const int nc = src0->ne[0];
5908
+
5909
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
5910
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
5911
+
5912
+ for (int i = 0; i < n; i++) {
5913
+ ggml_vec_log_f16(nc,
5914
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
5915
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
5916
+ }
5917
+ }
5918
+
5919
  static void ggml_compute_forward_log(
5920
  const struct ggml_compute_params * params,
5921
  struct ggml_tensor * dst) {
 
5927
  {
5928
  ggml_compute_forward_log_f32(params, dst);
5929
  } break;
5930
+ case GGML_TYPE_F16:
5931
+ {
5932
+ ggml_compute_forward_log_f16(params, dst);
5933
+ } break;
5934
  default:
5935
  {
5936
  GGML_ABORT("fatal error");
 
5965
  }
5966
  }
5967
 
5968
+ static void ggml_compute_forward_sin_f16(
5969
+ const struct ggml_compute_params * params,
5970
+ struct ggml_tensor * dst) {
5971
+
5972
+ const struct ggml_tensor * src0 = dst->src[0];
5973
+
5974
+ if (params->ith != 0) {
5975
+ return;
5976
+ }
5977
+
5978
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
5979
+
5980
+ const int n = ggml_nrows(src0);
5981
+ const int nc = src0->ne[0];
5982
+
5983
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
5984
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
5985
+
5986
+ for (int i = 0; i < n; i++) {
5987
+ ggml_vec_sin_f16(nc,
5988
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
5989
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
5990
+ }
5991
+ }
5992
+
5993
  static void ggml_compute_forward_sin(
5994
  const struct ggml_compute_params * params,
5995
  struct ggml_tensor * dst) {
 
6001
  {
6002
  ggml_compute_forward_sin_f32(params, dst);
6003
  } break;
6004
+ case GGML_TYPE_F16:
6005
+ {
6006
+ ggml_compute_forward_sin_f16(params, dst);
6007
+ } break;
6008
  default:
6009
  {
6010
  GGML_ABORT("fatal error");
 
6039
  }
6040
  }
6041
 
6042
+ static void ggml_compute_forward_cos_f16(
6043
+ const struct ggml_compute_params * params,
6044
+ struct ggml_tensor * dst) {
6045
+
6046
+ const struct ggml_tensor * src0 = dst->src[0];
6047
+
6048
+ if (params->ith != 0) {
6049
+ return;
6050
+ }
6051
+
6052
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
6053
+
6054
+ const int n = ggml_nrows(src0);
6055
+ const int nc = src0->ne[0];
6056
+
6057
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
6058
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
6059
+
6060
+ for (int i = 0; i < n; i++) {
6061
+ ggml_vec_cos_f16(nc,
6062
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6063
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6064
+ }
6065
+ }
6066
+
6067
  static void ggml_compute_forward_cos(
6068
  const struct ggml_compute_params * params,
6069
  struct ggml_tensor * dst) {
 
6075
  {
6076
  ggml_compute_forward_cos_f32(params, dst);
6077
  } break;
6078
+ case GGML_TYPE_F16:
6079
+ {
6080
+ ggml_compute_forward_cos_f16(params, dst);
6081
+ } break;
6082
  default:
6083
  {
6084
  GGML_ABORT("fatal error");
 
6736
  }
6737
  }
6738
 
6739
+ static void ggml_compute_forward_abs_f16(
6740
+ const struct ggml_compute_params * params,
6741
+ struct ggml_tensor * dst) {
6742
+
6743
+ const struct ggml_tensor * src0 = dst->src[0];
6744
+
6745
+ if (params->ith != 0) {
6746
+ return;
6747
+ }
6748
+
6749
+ assert(ggml_is_contiguous_1(src0));
6750
+ assert(ggml_is_contiguous_1(dst));
6751
+ assert(ggml_are_same_shape(src0, dst));
6752
+
6753
+ const int n = ggml_nrows(src0);
6754
+ const int nc = src0->ne[0];
6755
+
6756
+ for (int i = 0; i < n; i++) {
6757
+ ggml_vec_abs_f16(nc,
6758
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6759
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6760
+ }
6761
+ }
6762
+
6763
  static void ggml_compute_forward_abs(
6764
  const struct ggml_compute_params * params,
6765
  struct ggml_tensor * dst) {
 
6771
  {
6772
  ggml_compute_forward_abs_f32(params, dst);
6773
  } break;
6774
+ case GGML_TYPE_F16:
6775
+ {
6776
+ ggml_compute_forward_abs_f16(params, dst);
6777
+ } break;
6778
  default:
6779
  {
6780
  GGML_ABORT("fatal error");
 
6808
  }
6809
  }
6810
 
6811
+ static void ggml_compute_forward_sgn_f16(
6812
+ const struct ggml_compute_params * params,
6813
+ struct ggml_tensor * dst) {
6814
+
6815
+ const struct ggml_tensor * src0 = dst->src[0];
6816
+
6817
+ if (params->ith != 0) {
6818
+ return;
6819
+ }
6820
+
6821
+ assert(ggml_is_contiguous_1(src0));
6822
+ assert(ggml_is_contiguous_1(dst));
6823
+ assert(ggml_are_same_shape(src0, dst));
6824
+
6825
+ const int n = ggml_nrows(src0);
6826
+ const int nc = src0->ne[0];
6827
+
6828
+ for (int i = 0; i < n; i++) {
6829
+ ggml_vec_sgn_f16(nc,
6830
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6831
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6832
+ }
6833
+ }
6834
+
6835
  static void ggml_compute_forward_sgn(
6836
  const struct ggml_compute_params * params,
6837
  struct ggml_tensor * dst) {
 
6843
  {
6844
  ggml_compute_forward_sgn_f32(params, dst);
6845
  } break;
6846
+ case GGML_TYPE_F16:
6847
+ {
6848
+ ggml_compute_forward_sgn_f16(params, dst);
6849
+ } break;
6850
  default:
6851
  {
6852
  GGML_ABORT("fatal error");
 
6880
  }
6881
  }
6882
 
6883
+ static void ggml_compute_forward_neg_f16(
6884
+ const struct ggml_compute_params * params,
6885
+ struct ggml_tensor * dst) {
6886
+
6887
+ const struct ggml_tensor * src0 = dst->src[0];
6888
+
6889
+ if (params->ith != 0) {
6890
+ return;
6891
+ }
6892
+
6893
+ assert(ggml_is_contiguous_1(src0));
6894
+ assert(ggml_is_contiguous_1(dst));
6895
+ assert(ggml_are_same_shape(src0, dst));
6896
+
6897
+ const int n = ggml_nrows(src0);
6898
+ const int nc = src0->ne[0];
6899
+
6900
+ for (int i = 0; i < n; i++) {
6901
+ ggml_vec_neg_f16(nc,
6902
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6903
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6904
+ }
6905
+ }
6906
+
6907
  static void ggml_compute_forward_neg(
6908
  const struct ggml_compute_params * params,
6909
  struct ggml_tensor * dst) {
 
6915
  {
6916
  ggml_compute_forward_neg_f32(params, dst);
6917
  } break;
6918
+ case GGML_TYPE_F16:
6919
+ {
6920
+ ggml_compute_forward_neg_f16(params, dst);
6921
+ } break;
6922
  default:
6923
  {
6924
  GGML_ABORT("fatal error");
 
6952
  }
6953
  }
6954
 
6955
+ static void ggml_compute_forward_step_f16(
6956
+ const struct ggml_compute_params * params,
6957
+ struct ggml_tensor * dst) {
6958
+
6959
+ const struct ggml_tensor * src0 = dst->src[0];
6960
+
6961
+ if (params->ith != 0) {
6962
+ return;
6963
+ }
6964
+
6965
+ assert(ggml_is_contiguous_1(src0));
6966
+ assert(ggml_is_contiguous_1(dst));
6967
+ assert(ggml_are_same_shape(src0, dst));
6968
+
6969
+ const int n = ggml_nrows(src0);
6970
+ const int nc = src0->ne[0];
6971
+
6972
+ for (int i = 0; i < n; i++) {
6973
+ ggml_vec_step_f16(nc,
6974
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6975
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6976
+ }
6977
+ }
6978
+
6979
+ static void ggml_compute_forward_step(
6980
+ const struct ggml_compute_params * params,
6981
+ struct ggml_tensor * dst) {
6982
 
6983
  const struct ggml_tensor * src0 = dst->src[0];
6984
 
 
6987
  {
6988
  ggml_compute_forward_step_f32(params, dst);
6989
  } break;
6990
+ case GGML_TYPE_F16:
6991
+ {
6992
+ ggml_compute_forward_step_f16(params, dst);
6993
+ } break;
6994
  default:
6995
  {
6996
  GGML_ABORT("fatal error");
 
7024
  }
7025
  }
7026
 
7027
+ static void ggml_compute_forward_tanh_f16(
7028
+ const struct ggml_compute_params * params,
7029
+ struct ggml_tensor * dst) {
7030
+
7031
+ const struct ggml_tensor * src0 = dst->src[0];
7032
+
7033
+ if (params->ith != 0) {
7034
+ return;
7035
+ }
7036
+
7037
+ assert(ggml_is_contiguous_1(src0));
7038
+ assert(ggml_is_contiguous_1(dst));
7039
+ assert(ggml_are_same_shape(src0, dst));
7040
+
7041
+ const int n = ggml_nrows(src0);
7042
+ const int nc = src0->ne[0];
7043
+
7044
+ for (int i = 0; i < n; i++) {
7045
+ ggml_vec_tanh_f16(nc,
7046
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7047
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7048
+ }
7049
+ }
7050
+
7051
  static void ggml_compute_forward_tanh(
7052
  const struct ggml_compute_params * params,
7053
  struct ggml_tensor * dst) {
 
7059
  {
7060
  ggml_compute_forward_tanh_f32(params, dst);
7061
  } break;
7062
+ case GGML_TYPE_F16:
7063
+ {
7064
+ ggml_compute_forward_tanh_f16(params, dst);
7065
+ } break;
7066
  default:
7067
  {
7068
  GGML_ABORT("fatal error");
 
7096
  }
7097
  }
7098
 
7099
+ static void ggml_compute_forward_elu_f16(
7100
+ const struct ggml_compute_params * params,
7101
+ struct ggml_tensor * dst) {
7102
+
7103
+ const struct ggml_tensor * src0 = dst->src[0];
7104
+
7105
+ if (params->ith != 0) {
7106
+ return;
7107
+ }
7108
+
7109
+ assert(ggml_is_contiguous_1(src0));
7110
+ assert(ggml_is_contiguous_1(dst));
7111
+ assert(ggml_are_same_shape(src0, dst));
7112
+
7113
+ const int n = ggml_nrows(src0);
7114
+ const int nc = src0->ne[0];
7115
+
7116
+ for (int i = 0; i < n; i++) {
7117
+ ggml_vec_elu_f16(nc,
7118
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7119
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7120
+ }
7121
+ }
7122
+
7123
  static void ggml_compute_forward_elu(
7124
  const struct ggml_compute_params * params,
7125
  struct ggml_tensor * dst) {
 
7131
  {
7132
  ggml_compute_forward_elu_f32(params, dst);
7133
  } break;
7134
+ case GGML_TYPE_F16:
7135
+ {
7136
+ ggml_compute_forward_elu_f16(params, dst);
7137
+ } break;
7138
  default:
7139
  {
7140
  GGML_ABORT("fatal error");
 
7168
  }
7169
  }
7170
 
7171
+ static void ggml_compute_forward_relu_f16(
7172
+ const struct ggml_compute_params * params,
7173
+ struct ggml_tensor * dst) {
7174
+
7175
+ const struct ggml_tensor * src0 = dst->src[0];
7176
+
7177
+ if (params->ith != 0) {
7178
+ return;
7179
+ }
7180
+
7181
+ assert(ggml_is_contiguous_1(src0));
7182
+ assert(ggml_is_contiguous_1(dst));
7183
+ assert(ggml_are_same_shape(src0, dst));
7184
+
7185
+ const int n = ggml_nrows(src0);
7186
+ const int nc = src0->ne[0];
7187
+
7188
+ for (int i = 0; i < n; i++) {
7189
+ ggml_vec_relu_f16(nc,
7190
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7191
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7192
+ }
7193
+ }
7194
+
7195
  static void ggml_compute_forward_relu(
7196
  const struct ggml_compute_params * params,
7197
  struct ggml_tensor * dst) {
 
7203
  {
7204
  ggml_compute_forward_relu_f32(params, dst);
7205
  } break;
7206
+ case GGML_TYPE_F16:
7207
+ {
7208
+ ggml_compute_forward_relu_f16(params, dst);
7209
+ } break;
7210
  default:
7211
  {
7212
  GGML_ABORT("fatal error");
 
7240
  }
7241
  }
7242
 
7243
+ static void ggml_compute_forward_sigmoid_f16(
7244
+ const struct ggml_compute_params * params,
7245
+ struct ggml_tensor * dst) {
7246
+
7247
+ const struct ggml_tensor * src0 = dst->src[0];
7248
+
7249
+ if (params->ith != 0) {
7250
+ return;
7251
+ }
7252
+
7253
+ assert(ggml_is_contiguous_1(src0));
7254
+ assert(ggml_is_contiguous_1(dst));
7255
+ assert(ggml_are_same_shape(src0, dst));
7256
+
7257
+ const int n = ggml_nrows(src0);
7258
+ const int nc = src0->ne[0];
7259
+
7260
+ for (int i = 0; i < n; i++) {
7261
+ ggml_vec_sigmoid_f16(nc,
7262
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7263
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7264
+ }
7265
+ }
7266
+
7267
  static void ggml_compute_forward_sigmoid(
7268
  const struct ggml_compute_params * params,
7269
  struct ggml_tensor * dst) {
 
7275
  {
7276
  ggml_compute_forward_sigmoid_f32(params, dst);
7277
  } break;
7278
+ case GGML_TYPE_F16:
7279
+ {
7280
+ ggml_compute_forward_sigmoid_f16(params, dst);
7281
+ } break;
7282
  default:
7283
  {
7284
  GGML_ABORT("fatal error");
 
7327
  }
7328
  }
7329
 
7330
+ static void ggml_compute_forward_gelu_f16(
7331
+ const struct ggml_compute_params * params,
7332
+ struct ggml_tensor * dst) {
7333
+
7334
+ const struct ggml_tensor * src0 = dst->src[0];
7335
+
7336
+ assert(ggml_is_contiguous_1(src0));
7337
+ assert(ggml_is_contiguous_1(dst));
7338
+ assert(ggml_are_same_shape(src0, dst));
7339
+
7340
+ const int ith = params->ith;
7341
+ const int nth = params->nth;
7342
+
7343
+ const int nc = src0->ne[0];
7344
+ const int nr = ggml_nrows(src0);
7345
+
7346
+ // rows per thread
7347
+ const int dr = (nr + nth - 1)/nth;
7348
+
7349
+ // row range for this thread
7350
+ const int ir0 = dr*ith;
7351
+ const int ir1 = MIN(ir0 + dr, nr);
7352
+
7353
+ for (int i1 = ir0; i1 < ir1; i1++) {
7354
+ ggml_vec_gelu_f16(nc,
7355
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7356
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7357
+
7358
+ #ifndef NDEBUG
7359
+ for (int k = 0; k < nc; k++) {
7360
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7361
+ const float v = GGML_FP16_TO_FP32(x);
7362
+ UNUSED(v);
7363
+ assert(!isnan(v));
7364
+ assert(!isinf(v));
7365
+ }
7366
+ #endif
7367
+ }
7368
+ }
7369
+
7370
  static void ggml_compute_forward_gelu(
7371
  const struct ggml_compute_params * params,
7372
  struct ggml_tensor * dst) {
 
7378
  {
7379
  ggml_compute_forward_gelu_f32(params, dst);
7380
  } break;
7381
+ case GGML_TYPE_F16:
7382
+ {
7383
+ ggml_compute_forward_gelu_f16(params, dst);
7384
+ } break;
7385
  default:
7386
  {
7387
  GGML_ABORT("fatal error");
 
7430
  }
7431
  }
7432
 
7433
+ static void ggml_compute_forward_gelu_quick_f16(
7434
+ const struct ggml_compute_params * params,
7435
+ struct ggml_tensor * dst) {
7436
+
7437
+ const struct ggml_tensor * src0 = dst->src[0];
7438
+
7439
+ assert(ggml_is_contiguous_1(src0));
7440
+ assert(ggml_is_contiguous_1(dst));
7441
+ assert(ggml_are_same_shape(src0, dst));
7442
+
7443
+ const int ith = params->ith;
7444
+ const int nth = params->nth;
7445
+
7446
+ const int nc = src0->ne[0];
7447
+ const int nr = ggml_nrows(src0);
7448
+
7449
+ // rows per thread
7450
+ const int dr = (nr + nth - 1)/nth;
7451
+
7452
+ // row range for this thread
7453
+ const int ir0 = dr*ith;
7454
+ const int ir1 = MIN(ir0 + dr, nr);
7455
+
7456
+ for (int i1 = ir0; i1 < ir1; i1++) {
7457
+ ggml_vec_gelu_quick_f16(nc,
7458
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7459
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7460
+
7461
+ #ifndef NDEBUG
7462
+ for (int k = 0; k < nc; k++) {
7463
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7464
+ const float v = GGML_FP16_TO_FP32(x);
7465
+ UNUSED(v);
7466
+ assert(!isnan(v));
7467
+ assert(!isinf(v));
7468
+ }
7469
+ #endif
7470
+ }
7471
+ }
7472
+
7473
  static void ggml_compute_forward_gelu_quick(
7474
  const struct ggml_compute_params * params,
7475
  struct ggml_tensor * dst) {
 
7481
  {
7482
  ggml_compute_forward_gelu_quick_f32(params, dst);
7483
  } break;
7484
+ case GGML_TYPE_F16:
7485
+ {
7486
+ ggml_compute_forward_gelu_quick_f16(params, dst);
7487
+ } break;
7488
  default:
7489
  {
7490
  GGML_ABORT("fatal error");
 
7533
  }
7534
  }
7535
 
7536
+ static void ggml_compute_forward_silu_f16(
7537
+ const struct ggml_compute_params * params,
7538
+ struct ggml_tensor * dst) {
7539
+
7540
+ const struct ggml_tensor * src0 = dst->src[0];
7541
+
7542
+ assert(ggml_is_contiguous_1(src0));
7543
+ assert(ggml_is_contiguous_1(dst));
7544
+ assert(ggml_are_same_shape(src0, dst));
7545
+
7546
+ const int ith = params->ith;
7547
+ const int nth = params->nth;
7548
+
7549
+ const int nc = src0->ne[0];
7550
+ const int nr = ggml_nrows(src0);
7551
+
7552
+ // rows per thread
7553
+ const int dr = (nr + nth - 1)/nth;
7554
+
7555
+ // row range for this thread
7556
+ const int ir0 = dr*ith;
7557
+ const int ir1 = MIN(ir0 + dr, nr);
7558
+
7559
+ for (int i1 = ir0; i1 < ir1; i1++) {
7560
+ ggml_vec_silu_f16(nc,
7561
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7562
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7563
+
7564
+ #ifndef NDEBUG
7565
+ for (int k = 0; k < nc; k++) {
7566
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
7567
+ const float v = GGML_FP16_TO_FP32(x);
7568
+ UNUSED(v);
7569
+ assert(!isnan(v));
7570
+ assert(!isinf(v));
7571
+ }
7572
+ #endif
7573
+ }
7574
+ }
7575
+
7576
  static void ggml_compute_forward_silu(
7577
  const struct ggml_compute_params * params,
7578
  struct ggml_tensor * dst) {
 
7584
  {
7585
  ggml_compute_forward_silu_f32(params, dst);
7586
  } break;
7587
+ case GGML_TYPE_F16:
7588
+ {
7589
+ ggml_compute_forward_silu_f16(params, dst);
7590
+ } break;
7591
  default:
7592
  {
7593
  GGML_ABORT("fatal error");
 
7626
  }
7627
  }
7628
 
7629
+ static void ggml_compute_forward_leaky_relu_f16(
7630
+ const struct ggml_compute_params * params,
7631
+ struct ggml_tensor * dst) {
7632
+
7633
+ const struct ggml_tensor * src0 = dst->src[0];
7634
+
7635
+ if (params->ith != 0) {
7636
+ return;
7637
+ }
7638
+
7639
+ assert(ggml_is_contiguous_1(src0));
7640
+ assert(ggml_is_contiguous_1(dst));
7641
+ assert(ggml_are_same_shape(src0, dst));
7642
+
7643
+ const int n = ggml_nrows(src0);
7644
+ const int nc = src0->ne[0];
7645
+
7646
+ float negative_slope;
7647
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
7648
+
7649
+ assert(dst->nb[0] == sizeof(ggml_fp16_t));
7650
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
7651
+
7652
+ for (int i = 0; i < n; i++) {
7653
+ ggml_vec_leaky_relu_f16(nc,
7654
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7655
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
7656
+ }
7657
+ }
7658
+
7659
  static void ggml_compute_forward_leaky_relu(
7660
  const struct ggml_compute_params * params,
7661
  struct ggml_tensor * dst) {
 
7667
  {
7668
  ggml_compute_forward_leaky_relu_f32(params, dst);
7669
  } break;
7670
+ case GGML_TYPE_F16:
7671
+ {
7672
+ ggml_compute_forward_leaky_relu_f16(params, dst);
7673
+ } break;
7674
  default:
7675
  {
7676
  GGML_ABORT("fatal error");
 
7723
  }
7724
  }
7725
 
7726
+ static void ggml_compute_forward_silu_back_f16(
7727
+ const struct ggml_compute_params * params,
7728
+ struct ggml_tensor * dst) {
7729
+
7730
+ const struct ggml_tensor * grad = dst->src[0];
7731
+ const struct ggml_tensor * src1 = dst->src[1];
7732
+
7733
+ assert(ggml_is_contiguous_1(grad));
7734
+ assert(ggml_is_contiguous_1(src1));
7735
+ assert(ggml_is_contiguous_1(dst));
7736
+ assert(ggml_are_same_shape(src1, dst));
7737
+ assert(ggml_are_same_shape(src1, grad));
7738
+
7739
+ const int ith = params->ith;
7740
+ const int nth = params->nth;
7741
+
7742
+ const int nc = src1->ne[0];
7743
+ const int nr = ggml_nrows(src1);
7744
+
7745
+ // rows per thread
7746
+ const int dr = (nr + nth - 1)/nth;
7747
+
7748
+ // row range for this thread
7749
+ const int ir0 = dr*ith;
7750
+ const int ir1 = MIN(ir0 + dr, nr);
7751
+
7752
+ for (int i1 = ir0; i1 < ir1; i1++) {
7753
+ ggml_vec_silu_backward_f16(nc,
7754
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7755
+ (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
7756
+ (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
7757
+
7758
+ #ifndef NDEBUG
7759
+ for (int k = 0; k < nc; k++) {
7760
+ const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7761
+ const float v = GGML_FP16_TO_FP32(x);
7762
+ UNUSED(v);
7763
+ assert(!isnan(v));
7764
+ assert(!isinf(v));
7765
+ }
7766
+ #endif
7767
+ }
7768
+ }
7769
+
7770
  static void ggml_compute_forward_silu_back(
7771
  const struct ggml_compute_params * params,
7772
  struct ggml_tensor * dst) {
 
7778
  {
7779
  ggml_compute_forward_silu_back_f32(params, dst);
7780
  } break;
7781
+ case GGML_TYPE_F16:
7782
+ {
7783
+ ggml_compute_forward_silu_back_f16(params, dst);
7784
+ } break;
7785
  default:
7786
  {
7787
  GGML_ABORT("fatal error");
 
7789
  }
7790
  }
7791
 
 
7792
  static void ggml_compute_forward_hardswish_f32(
7793
  const struct ggml_compute_params * params,
7794
  struct ggml_tensor * dst) {
 
7812
  (float *) ((char *) src0->data + i*(src0->nb[1])));
7813
  }
7814
  }
7815
+
7816
+ static void ggml_compute_forward_hardswish_f16(
7817
+ const struct ggml_compute_params * params,
7818
+ struct ggml_tensor * dst) {
7819
+
7820
+ const struct ggml_tensor * src0 = dst->src[0];
7821
+
7822
+ if (params->ith != 0) {
7823
+ return;
7824
+ }
7825
+
7826
+ assert(ggml_is_contiguous_1(src0));
7827
+ assert(ggml_is_contiguous_1(dst));
7828
+ assert(ggml_are_same_shape(src0, dst));
7829
+
7830
+ const int n = ggml_nrows(src0);
7831
+ const int nc = src0->ne[0];
7832
+
7833
+ for (int i = 0; i < n; i++) {
7834
+ ggml_vec_hardswish_f16(nc,
7835
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7836
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7837
+ }
7838
+ }
7839
+
7840
  static void ggml_compute_forward_hardswish(
7841
  const struct ggml_compute_params * params,
7842
  struct ggml_tensor * dst) {
 
7848
  {
7849
  ggml_compute_forward_hardswish_f32(params, dst);
7850
  } break;
7851
+ case GGML_TYPE_F16:
7852
+ {
7853
+ ggml_compute_forward_hardswish_f16(params, dst);
7854
+ } break;
7855
  default:
7856
  {
7857
  GGML_ABORT("fatal error");
 
7883
  }
7884
  }
7885
 
7886
+ static void ggml_compute_forward_hardsigmoid_f16(
7887
+ const struct ggml_compute_params * params,
7888
+ struct ggml_tensor * dst) {
7889
+
7890
+ const struct ggml_tensor * src0 = dst->src[0];
7891
+
7892
+ if (params->ith != 0) {
7893
+ return;
7894
+ }
7895
+
7896
+ assert(ggml_is_contiguous_1(src0));
7897
+ assert(ggml_is_contiguous_1(dst));
7898
+ assert(ggml_are_same_shape(src0, dst));
7899
+
7900
+ const int n = ggml_nrows(src0);
7901
+ const int nc = src0->ne[0];
7902
+
7903
+ for (int i = 0; i < n; i++) {
7904
+ ggml_vec_hardsigmoid_f16(nc,
7905
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7906
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7907
+ }
7908
+ }
7909
+
7910
  static void ggml_compute_forward_hardsigmoid(
7911
  const struct ggml_compute_params * params,
7912
  struct ggml_tensor * dst) {
 
7918
  {
7919
  ggml_compute_forward_hardsigmoid_f32(params, dst);
7920
  } break;
7921
+ case GGML_TYPE_F16:
7922
+ {
7923
+ ggml_compute_forward_hardsigmoid_f16(params, dst);
7924
+ } break;
7925
  default:
7926
  {
7927
  GGML_ABORT("fatal error");
 
7953
  }
7954
  }
7955
 
7956
+ static void ggml_compute_forward_exp_f16(
7957
+ const struct ggml_compute_params * params,
7958
+ struct ggml_tensor * dst) {
7959
+
7960
+ const struct ggml_tensor * src0 = dst->src[0];
7961
+
7962
+ if (params->ith != 0) {
7963
+ return;
7964
+ }
7965
+
7966
+ assert(ggml_is_contiguous_1(src0));
7967
+ assert(ggml_is_contiguous_1(dst));
7968
+ assert(ggml_are_same_shape(src0, dst));
7969
+
7970
+ const int n = ggml_nrows(src0);
7971
+ const int nc = src0->ne[0];
7972
+
7973
+ for (int i = 0; i < n; i++) {
7974
+ ggml_vec_exp_f16(nc,
7975
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7976
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7977
+ }
7978
+ }
7979
+
7980
  static void ggml_compute_forward_exp(
7981
  const struct ggml_compute_params * params,
7982
  struct ggml_tensor * dst) {
 
7988
  {
7989
  ggml_compute_forward_exp_f32(params, dst);
7990
  } break;
7991
+ case GGML_TYPE_F16:
7992
+ {
7993
+ ggml_compute_forward_exp_f16(params, dst);
7994
+ } break;
7995
  default:
7996
  {
7997
  GGML_ABORT("fatal error");
 
10276
  }
10277
  }
10278
 
10279
+ static void ggml_compute_forward_clamp_f16(
10280
+ const struct ggml_compute_params * params,
10281
+ struct ggml_tensor * dst) {
10282
+
10283
+ const struct ggml_tensor * src0 = dst->src[0];
10284
+
10285
+ float min;
10286
+ float max;
10287
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
10288
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
10289
+
10290
+ const int ith = params->ith;
10291
+ const int nth = params->nth;
10292
+
10293
+ const int n = ggml_nrows(src0);
10294
+ const int nc = src0->ne[0];
10295
+
10296
+ const size_t nb00 = src0->nb[0];
10297
+ const size_t nb01 = src0->nb[1];
10298
+
10299
+ const size_t nb0 = dst->nb[0];
10300
+ const size_t nb1 = dst->nb[1];
10301
+
10302
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
10303
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
10304
+
10305
+ for (int j = ith; j < n; j += nth) {
10306
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
10307
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
10308
+
10309
+ for (int i = 0; i < nc; i++) {
10310
+ float v = GGML_FP16_TO_FP32(src0_ptr[i]);
10311
+ dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
10312
+ }
10313
+ }
10314
+ }
10315
+
10316
  static void ggml_compute_forward_clamp(
10317
  const struct ggml_compute_params * params,
10318
  struct ggml_tensor * dst) {
 
10325
  ggml_compute_forward_clamp_f32(params, dst);
10326
  } break;
10327
  case GGML_TYPE_F16:
10328
+ {
10329
+ ggml_compute_forward_clamp_f16(params, dst);
10330
+ } break;
10331
  case GGML_TYPE_BF16:
10332
  case GGML_TYPE_Q4_0:
10333
  case GGML_TYPE_Q4_1:
ggml/src/ggml-cuda/clamp.cu CHANGED
@@ -1,6 +1,7 @@
1
  #include "clamp.cuh"
2
 
3
- static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
 
4
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
5
 
6
  if (i >= k) {
@@ -10,25 +11,31 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
10
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
11
  }
12
 
13
- static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
 
14
  const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
15
- clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
16
  }
17
 
18
 
19
  void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
20
  const ggml_tensor * src0 = dst->src[0];
21
- const float * src0_d = (const float *)src0->data;
22
- float * dst_d = (float *)dst->data;
23
  cudaStream_t stream = ctx.stream();
24
 
25
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
26
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
27
 
28
  float min;
29
  float max;
30
  memcpy(&min, dst->op_params, sizeof(float));
31
  memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
32
 
33
- clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
 
 
 
 
34
  }
 
1
  #include "clamp.cuh"
2
 
3
+ template <class T>
4
+ static __global__ void op_clamp(const T * x, T * dst, const T min, const T max, const int k) {
5
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
6
 
7
  if (i >= k) {
 
11
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
12
  }
13
 
14
+ template <class T>
15
+ static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
16
  const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
17
+ op_clamp<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
18
  }
19
 
20
 
21
  void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
22
  const ggml_tensor * src0 = dst->src[0];
23
+ const void * src0_d = src0->data;
24
+ void * dst_d = dst->data;
25
  cudaStream_t stream = ctx.stream();
26
 
27
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
28
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
29
+ GGML_ASSERT(src0->type == dst->type);
30
 
31
  float min;
32
  float max;
33
  memcpy(&min, dst->op_params, sizeof(float));
34
  memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
35
 
36
+ if (src0->type == GGML_TYPE_F16) {
37
+ clamp_cuda((const half *)src0_d, (half *)dst_d, (half)min, (half)max, ggml_nelements(src0), stream);
38
+ } else {
39
+ clamp_cuda((const float *)src0_d, (float *)dst_d, (float)min, (float)max, ggml_nelements(src0), stream);
40
+ }
41
  }
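
Editorial sketch: the clamp change above collapses the old f32-only kernel into one template instantiated for float and half; min/max stay fp32 in op_params and are narrowed once on the host for the fp16 instantiation. A compact standalone version of that shape follows — block size 256 is an assumption standing in for CUDA_CLAMP_BLOCK_SIZE, and clamp_kernel / launch_clamp are illustrative names, not the committed ones.

#include <cuda_fp16.h>

template <class T>
static __global__ void clamp_kernel(const T * x, T * y, const T lo, const T hi, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    y[i] = x[i] < lo ? lo : (x[i] > hi ? hi : x[i]);   // same comparison form as op_clamp
}

template <class T>
static void launch_clamp(const T * x, T * y, const float lo, const float hi, const int k, cudaStream_t stream) {
    const int block = 256;                             // assumed; the real launcher uses CUDA_CLAMP_BLOCK_SIZE
    const int grid  = (k + block - 1) / block;
    clamp_kernel<<<grid, block, 0, stream>>>(x, y, (T)lo, (T)hi, k);   // fp32 bounds narrowed once, on the host
}

A caller would then pick launch_clamp<half> or launch_clamp<float> from the tensor type, exactly as ggml_cuda_op_clamp does above.
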
ggml/src/ggml-cuda/ggml-cuda.cu CHANGED
@@ -2145,6 +2145,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2145
  break;
2146
  case GGML_OP_UNARY:
2147
  switch (ggml_get_unary_op(dst)) {
2148
  case GGML_UNARY_OP_NEG:
2149
  ggml_cuda_op_neg(ctx, dst);
2150
  break;
@@ -2242,6 +2248,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2242
  case GGML_OP_CLAMP:
2243
  ggml_cuda_op_clamp(ctx, dst);
2244
  break;
2245
  case GGML_OP_NONE:
2246
  case GGML_OP_RESHAPE:
2247
  case GGML_OP_VIEW:
@@ -2960,6 +2969,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
2960
  switch (op->op) {
2961
  case GGML_OP_UNARY:
2962
  switch (ggml_get_unary_op(op)) {
 
 
2963
  case GGML_UNARY_OP_NEG:
2964
  case GGML_UNARY_OP_STEP:
2965
  case GGML_UNARY_OP_GELU:
@@ -3166,6 +3177,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3166
  case GGML_OP_SIN:
3167
  case GGML_OP_COS:
3168
  case GGML_OP_CLAMP:
 
3169
  return true;
3170
  case GGML_OP_CONT:
3171
  return op->src[0]->type != GGML_TYPE_BF16;
 
2145
  break;
2146
  case GGML_OP_UNARY:
2147
  switch (ggml_get_unary_op(dst)) {
2148
+ case GGML_UNARY_OP_ABS:
2149
+ ggml_cuda_op_abs(ctx, dst);
2150
+ break;
2151
+ case GGML_UNARY_OP_SGN:
2152
+ ggml_cuda_op_sgn(ctx, dst);
2153
+ break;
2154
  case GGML_UNARY_OP_NEG:
2155
  ggml_cuda_op_neg(ctx, dst);
2156
  break;
 
2248
  case GGML_OP_CLAMP:
2249
  ggml_cuda_op_clamp(ctx, dst);
2250
  break;
2251
+ case GGML_OP_LOG:
2252
+ ggml_cuda_op_log(ctx, dst);
2253
+ break;
2254
  case GGML_OP_NONE:
2255
  case GGML_OP_RESHAPE:
2256
  case GGML_OP_VIEW:
 
2969
  switch (op->op) {
2970
  case GGML_OP_UNARY:
2971
  switch (ggml_get_unary_op(op)) {
2972
+ case GGML_UNARY_OP_ABS:
2973
+ case GGML_UNARY_OP_SGN:
2974
  case GGML_UNARY_OP_NEG:
2975
  case GGML_UNARY_OP_STEP:
2976
  case GGML_UNARY_OP_GELU:
 
3177
  case GGML_OP_SIN:
3178
  case GGML_OP_COS:
3179
  case GGML_OP_CLAMP:
3180
+ case GGML_OP_LOG:
3181
  return true;
3182
  case GGML_OP_CONT:
3183
  return op->src[0]->type != GGML_TYPE_BF16;
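
Editorial note: each op added here lands in two switches that must stay in lockstep — the compute dispatch (ggml_cuda_compute_forward) and the capability check (ggml_backend_cuda_device_supports_op). The tiny stand-in below only illustrates that coupling; the enum and functions are not ggml API.

#include <cassert>

enum class uop { abs_op, sgn_op, neg_op };

// Capability check: advertise only what the dispatch below can actually run.
static bool supports(uop op) {
    switch (op) {
        case uop::abs_op:
        case uop::sgn_op:
        case uop::neg_op:
            return true;
    }
    return false;
}

// Compute dispatch: one case per advertised op.
static float run(uop op, float x) {
    switch (op) {
        case uop::abs_op: return x < 0.0f ? -x : x;
        case uop::sgn_op: return x > 0.0f ? 1.0f : (x < 0.0f ? -1.0f : 0.0f);
        case uop::neg_op: return -x;
    }
    assert(false && "supports() and run() disagree");
    return 0.0f;
}
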
ggml/src/ggml-cuda/unary.cu CHANGED
@@ -1,6 +1,29 @@
1
  #include "unary.cuh"
2
 
3
- static __global__ void neg_f32(const float * x, float * dst, const int k) {
4
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
5
 
6
  if (i >= k) {
@@ -10,61 +33,67 @@ static __global__ void neg_f32(const float * x, float * dst, const int k) {
10
  dst[i] = -x[i];
11
  }
12
 
13
- static __global__ void step_f32(const float * x, float * dst, const int k) {
 
14
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
15
 
16
  if (i >= k) {
17
  return;
18
  }
19
 
20
- dst[i] = x[i] > 0.0f;
21
  }
22
 
23
- static __global__ void gelu_f32(const float * x, float * dst, const int k) {
24
- const float GELU_COEF_A = 0.044715f;
25
- const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
26
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
27
 
28
  if (i >= k) {
29
  return;
30
  }
31
 
32
- float xi = x[i];
33
- dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
34
  }
35
 
36
- static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
37
- const float GELU_QUICK_COEF = -1.702f;
 
38
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
39
  if (i >= k) {
40
  return;
41
  }
42
- dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
43
  }
44
 
45
- static __global__ void silu_f32(const float * x, float * dst, const int k) {
 
46
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
47
 
48
  if (i >= k) {
49
  return;
50
  }
51
- dst[i] = x[i] / (1.0f + expf(-x[i]));
52
  }
53
 
54
- static __global__ void silu_back_f32(
55
- const float * grad, const float * xf, float * dst, const int k) {
 
56
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
57
 
58
  if (i >= k) {
59
  return;
60
  }
61
 
62
- const float xfi = xf[i];
63
- const float s = 1.0f / (1.0f + expf(-xfi));
64
- dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
65
  }
66
 
67
- static __global__ void tanh_f32(const float * x, float * dst, int k) {
 
68
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
69
  if (i >= k) {
70
  return;
@@ -72,7 +101,8 @@ static __global__ void tanh_f32(const float * x, float * dst, int k) {
72
  dst[i] = tanhf(x[i]);
73
  }
74
 
75
- static __global__ void relu_f32(const float * x, float * dst, const int k) {
 
76
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
77
 
78
  if (i >= k) {
@@ -81,34 +111,38 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
81
  dst[i] = fmaxf(x[i], 0);
82
  }
83
 
84
- static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
 
85
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
86
 
87
  if (i >= k) {
88
  return;
89
  }
90
- dst[i] = 1.0f / (1.0f + expf(-x[i]));
91
  }
92
 
93
- static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
 
94
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
95
 
96
  if (i >= k) {
97
  return;
98
  }
99
- dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
100
  }
101
 
102
- static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
 
103
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
104
 
105
  if (i >= k) {
106
  return;
107
  }
108
- dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
109
  }
110
 
111
- static __global__ void exp_f32(const float * x, float * dst, const int k) {
 
112
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
113
 
114
  if (i >= k) {
@@ -117,15 +151,17 @@ static __global__ void exp_f32(const float * x, float * dst, const int k) {
117
  dst[i] = expf(x[i]);
118
  }
119
 
120
- static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
 
121
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
122
  if (i >= k) {
123
  return;
124
  }
125
- dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
126
  }
127
 
128
- static __global__ void sqr_f32(const float * x, float * dst, const int k) {
 
129
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
130
 
131
  if (i >= k) {
@@ -134,7 +170,8 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
134
  dst[i] = x[i] * x[i];
135
  }
136
 
137
- static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
 
138
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
139
 
140
  if (i >= k) {
@@ -143,7 +180,8 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
143
  dst[i] = sqrtf(x[i]);
144
  }
145
 
146
- static __global__ void sin_f32(const float * x, float * dst, const int k) {
 
147
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
148
 
149
  if (i >= k) {
@@ -152,7 +190,8 @@ static __global__ void sin_f32(const float * x, float * dst, const int k) {
152
  dst[i] = sinf(x[i]);
153
  }
154
 
155
- static __global__ void cos_f32(const float * x, float * dst, const int k) {
 
156
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
157
 
158
  if (i >= k) {
@@ -161,145 +200,248 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
161
  dst[i] = cosf(x[i]);
162
  }
163
 
164
- static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
165
  const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
166
- neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
167
  }
168
 
169
- static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
170
  const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
171
- step_f32<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
172
  }
173
 
174
- static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
175
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
176
- gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
177
  }
178
 
179
- static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
180
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
181
- gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
182
  }
183
 
184
- static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
185
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
186
- silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
187
  }
188
 
189
- static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
 
190
  const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
191
- silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
192
  }
193
 
194
- static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
195
  const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
196
- tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
197
  }
198
 
199
- static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
200
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
201
- relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
202
  }
203
 
204
- static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
205
  const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
206
- sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
207
  }
208
 
209
- static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
210
  const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
211
- hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
212
  }
213
 
214
- static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
215
  const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
216
- hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
217
  }
218
 
219
- static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
220
  const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
221
- exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
222
  }
223
 
224
- static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
 
225
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
226
- leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
227
  }
228
 
229
- static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
230
  const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
231
- sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
232
  }
233
 
234
- static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
235
  const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
236
- sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
237
  }
238
 
239
- static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
240
  const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
241
- sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
242
  }
243
 
244
- static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
 
245
  const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
246
- cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
247
  }
248
 
249
  void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
250
  const ggml_tensor * src0 = dst->src[0];
251
- const float * src0_d = (const float *)src0->data;
252
- float * dst_d = (float *)dst->data;
253
  cudaStream_t stream = ctx.stream();
254
 
255
  GGML_ASSERT(ggml_is_contiguous(src0));
256
 
257
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
258
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
259
 
260
- neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
261
  }
262
 
263
  void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
264
  const ggml_tensor * src0 = dst->src[0];
265
- const float * src0_d = (const float *)src0->data;
266
- float * dst_d = (float *)dst->data;
267
  cudaStream_t stream = ctx.stream();
268
 
269
  GGML_ASSERT(ggml_is_contiguous(src0));
270
 
271
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
272
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
273
 
274
- step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
275
  }
276
 
277
  void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
278
  const ggml_tensor * src0 = dst->src[0];
279
- const float * src0_d = (const float *)src0->data;
280
- float * dst_d = (float *)dst->data;
281
  cudaStream_t stream = ctx.stream();
282
 
283
  GGML_ASSERT(ggml_is_contiguous(src0));
284
 
285
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
286
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
287
 
288
- gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
289
  }
290
 
291
  void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
292
  const ggml_tensor * src0 = dst->src[0];
293
- const float * src0_d = (const float *)src0->data;
294
- float * dst_d = (float *)dst->data;
295
  cudaStream_t stream = ctx.stream();
296
 
297
  GGML_ASSERT(ggml_is_contiguous(src0));
298
 
299
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
300
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
301
 
302
- silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
303
  }
304
 
305
  void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -314,179 +456,263 @@ void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
314
 
315
  GGML_ASSERT(ggml_is_contiguous(src0));
316
 
317
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
318
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
319
 
320
- silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
321
  }
322
 
323
  void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
324
  const ggml_tensor * src0 = dst->src[0];
325
- const float * src0_d = (const float *)src0->data;
326
- float * dst_d = (float *)dst->data;
327
  cudaStream_t stream = ctx.stream();
328
 
329
  GGML_ASSERT(ggml_is_contiguous(src0));
330
 
331
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
332
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
333
 
334
- gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
335
  }
336
 
337
  void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
338
  const ggml_tensor * src0 = dst->src[0];
339
- const float * src0_d = (const float *)src0->data;
340
- float * dst_d = (float *)dst->data;
341
  cudaStream_t stream = ctx.stream();
342
 
343
  GGML_ASSERT(ggml_is_contiguous(src0));
344
 
345
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
346
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
347
 
348
- tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
349
  }
350
 
351
  void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
352
  const ggml_tensor * src0 = dst->src[0];
353
- const float * src0_d = (const float *)src0->data;
354
- float * dst_d = (float *)dst->data;
355
  cudaStream_t stream = ctx.stream();
356
 
357
  GGML_ASSERT(ggml_is_contiguous(src0));
358
 
359
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
360
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
361
 
362
- relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
363
  }
364
 
365
  void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
366
  const ggml_tensor * src0 = dst->src[0];
367
- const float * src0_d = (const float *)src0->data;
368
- float * dst_d = (float *)dst->data;
369
  cudaStream_t stream = ctx.stream();
370
 
371
  GGML_ASSERT(ggml_is_contiguous(src0));
372
 
373
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
374
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
375
 
376
- sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
377
  }
378
 
379
  void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
380
  const ggml_tensor * src0 = dst->src[0];
381
- const float * src0_d = (const float *)src0->data;
382
- float * dst_d = (float *)dst->data;
383
  cudaStream_t stream = ctx.stream();
384
 
385
  GGML_ASSERT(ggml_is_contiguous(src0));
386
 
387
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
388
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
389
 
390
- hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
391
  }
392
 
393
  void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
394
  const ggml_tensor * src0 = dst->src[0];
395
- const float * src0_d = (const float *)src0->data;
396
- float * dst_d = (float *)dst->data;
397
  cudaStream_t stream = ctx.stream();
398
 
399
  GGML_ASSERT(ggml_is_contiguous(src0));
400
 
401
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
402
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
403
 
404
- hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
405
  }
406
 
407
  void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
408
  const ggml_tensor * src0 = dst->src[0];
409
- const float * src0_d = (const float *)src0->data;
410
- float * dst_d = (float *)dst->data;
411
  cudaStream_t stream = ctx.stream();
412
 
413
  GGML_ASSERT(ggml_is_contiguous(src0));
414
 
415
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
416
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
417
 
418
- exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
419
  }
420
 
421
  void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
422
  const ggml_tensor * src0 = dst->src[0];
423
- const float * src0_d = (const float *)src0->data;
424
- float * dst_d = (float *)dst->data;
425
  cudaStream_t stream = ctx.stream();
426
 
427
  GGML_ASSERT(ggml_is_contiguous(src0));
428
 
429
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
430
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
431
 
432
  float negative_slope;
433
  memcpy(&negative_slope, dst->op_params, sizeof(float));
434
 
435
- leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream);
436
  }
437
 
438
  void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
439
  const ggml_tensor * src0 = dst->src[0];
440
- const float * src0_d = (const float *)src0->data;
441
- float * dst_d = (float *)dst->data;
442
  cudaStream_t stream = ctx.stream();
443
 
444
  GGML_ASSERT(ggml_is_contiguous(src0));
445
 
446
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
447
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
448
 
449
- sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
450
  }
451
 
452
  void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
453
  const ggml_tensor * src0 = dst->src[0];
454
- const float * src0_d = (const float *)src0->data;
455
- float * dst_d = (float *)dst->data;
456
  cudaStream_t stream = ctx.stream();
457
 
458
  GGML_ASSERT(ggml_is_contiguous(src0));
459
 
460
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
461
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
462
 
463
- sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
464
  }
465
 
466
  void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
467
  const ggml_tensor * src0 = dst->src[0];
468
- const float * src0_d = (const float *)src0->data;
469
- float * dst_d = (float *)dst->data;
470
  cudaStream_t stream = ctx.stream();
471
 
472
  GGML_ASSERT(ggml_is_contiguous(src0));
473
 
474
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
475
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
476
 
477
- sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
478
  }
479
 
480
  void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
481
  const ggml_tensor * src0 = dst->src[0];
482
- const float * src0_d = (const float *)src0->data;
483
- float * dst_d = (float *)dst->data;
484
  cudaStream_t stream = ctx.stream();
485
 
486
  GGML_ASSERT(ggml_is_contiguous(src0));
487
 
488
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
489
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
490
 
491
- cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
492
  }
 
1
  #include "unary.cuh"
2
 
3
+ template <class T>
4
+ static __global__ void op_abs(const T * x, T * dst, const int k) {
5
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
6
+
7
+ if (i >= k) {
8
+ return;
9
+ }
10
+
11
+ dst[i] = fabsf(x[i]);
12
+ }
13
+
14
+ template <class T>
15
+ static __global__ void op_sgn(const T * x, T * dst, const int k) {
16
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
17
+
18
+ if (i >= k) {
19
+ return;
20
+ }
21
+
22
+ dst[i] = (T)(x[i] > (T)0.f ? 1.f : ((x[i] < (T)0.f ? -1.f : 0.f)));
23
+ }
24
+
25
+ template <class T>
26
+ static __global__ void op_neg(const T * x, T * dst, const int k) {
27
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
28
 
29
  if (i >= k) {
 
33
  dst[i] = -x[i];
34
  }
35
 
36
+ template <class T>
37
+ static __global__ void op_step(const T * x, T * dst, const int k) {
38
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
39
 
40
  if (i >= k) {
41
  return;
42
  }
43
 
44
+ dst[i] = x[i] > (T)0.0f;
45
  }
46
 
47
+ template <class T>
48
+ static __global__ void op_gelu(const T * x, T * dst, const int k) {
49
+ const T GELU_COEF_A = 0.044715f;
50
+ const T SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
51
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
52
 
53
  if (i >= k) {
54
  return;
55
  }
56
 
57
+ T xi = x[i];
58
+ dst[i] = (T)0.5f*xi*((T)1.0f + (T)tanhf(SQRT_2_OVER_PI*xi*((T)1.0f + GELU_COEF_A*xi*xi)));
59
  }
60
 
61
+ template <class T>
62
+ static __global__ void op_gelu_quick(const T * x, T * dst, int k) {
63
+ const T GELU_QUICK_COEF = -1.702f;
64
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
65
  if (i >= k) {
66
  return;
67
  }
68
+ dst[i] = x[i] * ((T)1.0f / ((T)1.0f + (T)expf(GELU_QUICK_COEF * x[i])));
69
  }
70
 
71
+ template <class T>
72
+ static __global__ void op_silu(const T * x, T * dst, const int k) {
73
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
74
 
75
  if (i >= k) {
76
  return;
77
  }
78
+ dst[i] = x[i] / ((T)1.0f + (T)expf(-x[i]));
79
  }
80
 
81
+ template <class T>
82
+ static __global__ void op_silu_back(
83
+ const T * grad, const T * xf, T * dst, const int k) {
84
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
85
 
86
  if (i >= k) {
87
  return;
88
  }
89
 
90
+ const T xfi = xf[i];
91
+ const T s = (T)1.0f / ((T)1.0f + (T)expf(-xfi));
92
+ dst[i] = grad[i] * s * ((T)1.0f + xfi * ((T)1.0f - s));
93
  }
94
 
95
+ template <class T>
96
+ static __global__ void op_tanh(const T * x, T * dst, int k) {
97
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
98
  if (i >= k) {
99
  return;
 
101
  dst[i] = tanhf(x[i]);
102
  }
103
 
104
+ template <class T>
105
+ static __global__ void op_relu(const T * x, T * dst, const int k) {
106
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
107
 
108
  if (i >= k) {
 
111
  dst[i] = fmaxf(x[i], 0);
112
  }
113
 
114
+ template <class T>
115
+ static __global__ void op_sigmoid(const T * x, T * dst, const int k) {
116
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
117
 
118
  if (i >= k) {
119
  return;
120
  }
121
+ dst[i] = (T)1.0f / ((T)1.0f + (T)expf(-x[i]));
122
  }
123
 
124
+ template <class T>
125
+ static __global__ void op_hardsigmoid(const T * x, T * dst, const int k) {
126
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
127
 
128
  if (i >= k) {
129
  return;
130
  }
131
+ dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
132
  }
133
 
134
+ template <class T>
135
+ static __global__ void op_hardswish(const T * x, T * dst, const int k) {
136
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
137
 
138
  if (i >= k) {
139
  return;
140
  }
141
+ dst[i] = x[i] * (T)fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
142
  }
143
 
144
+ template <class T>
145
+ static __global__ void op_exp(const T * x, T * dst, const int k) {
146
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
147
 
148
  if (i >= k) {
 
151
  dst[i] = expf(x[i]);
152
  }
153
 
154
+ template <class T>
155
+ static __global__ void op_leaky_relu(const T * x, T * dst, const int k, const float negative_slope) {
156
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
157
  if (i >= k) {
158
  return;
159
  }
160
+ dst[i] = (T)fmaxf(x[i], 0) + (T)fminf(x[i], 0.0f) * (T)negative_slope;
161
  }
162
 
163
+ template <class T>
164
+ static __global__ void op_sqr(const T * x, T * dst, const int k) {
165
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
166
 
167
  if (i >= k) {
 
170
  dst[i] = x[i] * x[i];
171
  }
172
 
173
+ template <class T>
174
+ static __global__ void op_sqrt(const T * x, T * dst, const int k) {
175
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
176
 
177
  if (i >= k) {
 
180
  dst[i] = sqrtf(x[i]);
181
  }
182
 
183
+ template <class T>
184
+ static __global__ void op_sin(const T * x, T * dst, const int k) {
185
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
186
 
187
  if (i >= k) {
 
190
  dst[i] = sinf(x[i]);
191
  }
192
 
193
+ template <class T>
194
+ static __global__ void op_cos(const T * x, T * dst, const int k) {
195
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
196
 
197
  if (i >= k) {
 
200
  dst[i] = cosf(x[i]);
201
  }
202
 
203
+ template <class T>
204
+ static __global__ void op_log(const T * x, T * dst, const int k) {
205
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
206
+
207
+ if (i >= k) {
208
+ return;
209
+ }
210
+ dst[i] = logf(x[i]);
211
+ }
212
+
213
+ template <class T>
214
+ static void abs_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
215
+ const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
216
+ op_abs<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
217
+ }
218
+
219
+ template <class T>
220
+ static void sgn_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
221
+ const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
222
+ op_sgn<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
223
+ }
224
+
225
+ template <class T>
226
+ static void neg_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
227
  const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
228
+ op_neg<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
229
  }
230
 
231
+ template <class T>
232
+ static void step_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
233
  const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
234
+ op_step<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
235
  }
236
 
237
+ template <class T>
238
+ static void gelu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
239
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
240
+ op_gelu<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
241
  }
242
 
243
+ template <class T>
244
+ static void gelu_quick_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
245
  const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
246
+ op_gelu_quick<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
247
  }
248
 
249
+ template <class T>
250
+ static void silu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
251
  const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
252
+ op_silu<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
253
  }
254
 
255
+ template <class T>
256
+ static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
257
  const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
258
+ op_silu_back<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
259
  }
260
 
261
+ template <class T>
262
+ static void tanh_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
263
  const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
264
+ op_tanh<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
265
  }
266
 
267
+ template <class T>
268
+ static void relu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
269
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
270
+ op_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
271
  }
272
 
273
+ template <class T>
274
+ static void sigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
275
  const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
276
+ op_sigmoid<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
277
  }
278
 
279
+ template <class T>
280
+ static void hardsigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
281
  const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
282
+ op_hardsigmoid<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
283
  }
284
 
285
+ template <class T>
286
+ static void hardswish_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
287
  const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
288
+ op_hardswish<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
289
  }
290
 
291
+ template <class T>
292
+ static void exp_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
293
  const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
294
+ op_exp<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
295
  }
296
 
297
+ template <class T>
298
+ static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
299
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
300
+ op_leaky_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
301
  }
302
 
303
+ template <class T>
304
+ static void sqr_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
305
  const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
306
+ op_sqr<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
307
  }
308
 
309
+ template <class T>
310
+ static void sqrt_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
311
  const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
312
+ op_sqrt<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
313
  }
314
 
315
+ template <class T>
316
+ static void sin_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
317
  const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
318
+ op_sin<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
319
  }
320
 
321
+ template <class T>
322
+ static void cos_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
323
  const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
324
+ op_cos<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
325
+ }
326
+
327
+ template <class T>
328
+ static void log_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
329
+ const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
330
+ op_log<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
331
+ }
332
+
333
+ void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
334
+ const ggml_tensor * src0 = dst->src[0];
335
+ const void * src0_d = src0->data;
336
+ void * dst_d = dst->data;
337
+ cudaStream_t stream = ctx.stream();
338
+
339
+ GGML_ASSERT(ggml_is_contiguous(src0));
340
+
341
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
342
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
343
+ GGML_ASSERT(src0->type == dst->type);
344
+
345
+ if (src0->type == GGML_TYPE_F16) {
346
+ abs_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
347
+ } else {
348
+ abs_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
349
+ }
350
+ }
351
+
352
+ void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
353
+ const ggml_tensor * src0 = dst->src[0];
354
+ const void * src0_d = src0->data;
355
+ void * dst_d = dst->data;
356
+ cudaStream_t stream = ctx.stream();
357
+
358
+ GGML_ASSERT(ggml_is_contiguous(src0));
359
+
360
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
361
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
362
+ GGML_ASSERT(src0->type == dst->type);
363
+
364
+ if (src0->type == GGML_TYPE_F16) {
365
+ sgn_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
366
+ } else {
367
+ sgn_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
368
+ }
369
  }
370
 
371
  void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
372
  const ggml_tensor * src0 = dst->src[0];
373
+ const void * src0_d = src0->data;
374
+ void * dst_d = dst->data;
375
  cudaStream_t stream = ctx.stream();
376
 
377
  GGML_ASSERT(ggml_is_contiguous(src0));
378
 
379
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
380
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
381
+ GGML_ASSERT(src0->type == dst->type);
382
 
383
+ if (src0->type == GGML_TYPE_F16) {
384
+ neg_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
385
+ } else {
386
+ neg_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
387
+ }
388
  }
389
 
390
  void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
391
  const ggml_tensor * src0 = dst->src[0];
392
+ const void * src0_d = src0->data;
393
+ void * dst_d = dst->data;
394
  cudaStream_t stream = ctx.stream();
395
 
396
  GGML_ASSERT(ggml_is_contiguous(src0));
397
 
398
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
399
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
400
+ GGML_ASSERT(src0->type == dst->type);
401
 
402
+ if (src0->type == GGML_TYPE_F16) {
403
+ step_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
404
+ } else {
405
+ step_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
406
+ }
407
  }
408
 
409
  void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
410
  const ggml_tensor * src0 = dst->src[0];
411
+ const void * src0_d = src0->data;
412
+ void * dst_d = dst->data;
413
  cudaStream_t stream = ctx.stream();
414
 
415
  GGML_ASSERT(ggml_is_contiguous(src0));
416
 
417
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
418
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
419
+ GGML_ASSERT(src0->type == dst->type);
420
 
421
+ if (src0->type == GGML_TYPE_F16) {
422
+ gelu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
423
+ } else {
424
+ gelu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
425
+ }
426
  }
427
 
428
  void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
429
  const ggml_tensor * src0 = dst->src[0];
430
+ const void * src0_d = src0->data;
431
+ void * dst_d = dst->data;
432
  cudaStream_t stream = ctx.stream();
433
 
434
  GGML_ASSERT(ggml_is_contiguous(src0));
435
 
436
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
437
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
438
+ GGML_ASSERT(src0->type == dst->type);
439
 
440
+ if (src0->type == GGML_TYPE_F16) {
441
+ silu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
442
+ } else {
443
+ silu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
444
+ }
445
  }
446
 
447
  void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
456
 
457
  GGML_ASSERT(ggml_is_contiguous(src0));
458
 
459
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
460
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
461
+ GGML_ASSERT(src0->type == dst->type);
462
 
463
+ if (src0->type == GGML_TYPE_F16) {
464
+ silu_back_cuda((const half *)src0_d, (const half *)src1_d, (half *)dst_d, ggml_nelements(src0), stream);
465
+ } else {
466
+ silu_back_cuda((const float*)src0_d, (const float*)src1_d, (float *)dst_d, ggml_nelements(src0), stream);
467
+ }
468
  }
469
 
470
  void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
471
  const ggml_tensor * src0 = dst->src[0];
472
+ const void * src0_d = src0->data;
473
+ void * dst_d = dst->data;
474
  cudaStream_t stream = ctx.stream();
475
 
476
  GGML_ASSERT(ggml_is_contiguous(src0));
477
 
478
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
479
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
480
+ GGML_ASSERT(src0->type == dst->type);
481
 
482
+ if (src0->type == GGML_TYPE_F16) {
483
+ gelu_quick_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
484
+ } else {
485
+ gelu_quick_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
486
+ }
487
  }
488
 
489
  void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
490
  const ggml_tensor * src0 = dst->src[0];
491
+ const void * src0_d = src0->data;
492
+ void * dst_d = dst->data;
493
  cudaStream_t stream = ctx.stream();
494
 
495
  GGML_ASSERT(ggml_is_contiguous(src0));
496
 
497
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
498
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
499
+ GGML_ASSERT(src0->type == dst->type);
500
 
501
+ if (src0->type == GGML_TYPE_F16) {
502
+ tanh_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
503
+ } else {
504
+ tanh_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
505
+ }
506
  }
507
 
508
  void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
509
  const ggml_tensor * src0 = dst->src[0];
510
+ const void * src0_d = src0->data;
511
+ void * dst_d = dst->data;
512
  cudaStream_t stream = ctx.stream();
513
 
514
  GGML_ASSERT(ggml_is_contiguous(src0));
515
 
516
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
517
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
518
+ GGML_ASSERT(src0->type == dst->type);
519
 
520
+ if (src0->type == GGML_TYPE_F16) {
521
+ relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
522
+ } else {
523
+ relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
524
+ }
525
  }
526
 
527
  void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
528
  const ggml_tensor * src0 = dst->src[0];
529
+ const void * src0_d = src0->data;
530
+ void * dst_d = dst->data;
531
  cudaStream_t stream = ctx.stream();
532
 
533
  GGML_ASSERT(ggml_is_contiguous(src0));
534
 
535
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
536
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
537
+ GGML_ASSERT(src0->type == dst->type);
538
 
539
+ if (src0->type == GGML_TYPE_F16) {
540
+ sigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
541
+ } else {
542
+ sigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
543
+ }
544
  }
545
 
546
  void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
547
  const ggml_tensor * src0 = dst->src[0];
548
+ const void * src0_d = src0->data;
549
+ void * dst_d = dst->data;
550
  cudaStream_t stream = ctx.stream();
551
 
552
  GGML_ASSERT(ggml_is_contiguous(src0));
553
 
554
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
555
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
556
+ GGML_ASSERT(src0->type == dst->type);
557
 
558
+ if (src0->type == GGML_TYPE_F16) {
559
+ hardsigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
560
+ } else {
561
+ hardsigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
562
+ }
563
  }
564
 
565
  void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
566
  const ggml_tensor * src0 = dst->src[0];
567
+ const void * src0_d = src0->data;
568
+ void * dst_d = dst->data;
569
  cudaStream_t stream = ctx.stream();
570
 
571
  GGML_ASSERT(ggml_is_contiguous(src0));
572
 
573
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
574
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
575
+ GGML_ASSERT(src0->type == dst->type);
576
 
577
+ if (src0->type == GGML_TYPE_F16) {
578
+ hardswish_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
579
+ } else {
580
+ hardswish_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
581
+ }
582
  }
583
 
584
  void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
585
  const ggml_tensor * src0 = dst->src[0];
586
+ const void * src0_d = src0->data;
587
+ void * dst_d = dst->data;
588
  cudaStream_t stream = ctx.stream();
589
 
590
  GGML_ASSERT(ggml_is_contiguous(src0));
591
 
592
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
593
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
594
+ GGML_ASSERT(src0->type == dst->type);
595
 
596
+ if (src0->type == GGML_TYPE_F16) {
597
+ exp_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
598
+ } else {
599
+ exp_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
600
+ }
601
  }
602
 
603
  void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
604
  const ggml_tensor * src0 = dst->src[0];
605
+ const void * src0_d = src0->data;
606
+ void * dst_d = dst->data;
607
  cudaStream_t stream = ctx.stream();
608
 
609
  GGML_ASSERT(ggml_is_contiguous(src0));
610
 
611
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
612
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
613
+ GGML_ASSERT(src0->type == dst->type);
614
 
615
  float negative_slope;
616
  memcpy(&negative_slope, dst->op_params, sizeof(float));
617
 
618
+ if (src0->type == GGML_TYPE_F16) {
619
+ leaky_relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), negative_slope, stream);
620
+ } else {
621
+ leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
622
+ }
623
  }
624
 
625
  void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
626
  const ggml_tensor * src0 = dst->src[0];
627
+ const void * src0_d = src0->data;
628
+ void * dst_d = dst->data;
629
  cudaStream_t stream = ctx.stream();
630
 
631
  GGML_ASSERT(ggml_is_contiguous(src0));
632
 
633
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
634
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
635
+ GGML_ASSERT(src0->type == dst->type);
636
 
637
+ if (src0->type == GGML_TYPE_F16) {
638
+ sqr_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
639
+ } else {
640
+ sqr_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
641
+ }
642
  }
643
 
644
  void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
645
  const ggml_tensor * src0 = dst->src[0];
646
+ const void * src0_d = src0->data;
647
+ void * dst_d = dst->data;
648
  cudaStream_t stream = ctx.stream();
649
 
650
  GGML_ASSERT(ggml_is_contiguous(src0));
651
 
652
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
653
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
654
+ GGML_ASSERT(src0->type == dst->type);
655
 
656
+ if (src0->type == GGML_TYPE_F16) {
657
+ sqrt_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
658
+ } else {
659
+ sqrt_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
660
+ }
661
  }
662
 
663
  void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
664
  const ggml_tensor * src0 = dst->src[0];
665
+ const void * src0_d = src0->data;
666
+ void * dst_d = dst->data;
667
  cudaStream_t stream = ctx.stream();
668
 
669
  GGML_ASSERT(ggml_is_contiguous(src0));
670
 
671
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
672
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
673
+ GGML_ASSERT(src0->type == dst->type);
674
 
675
+ if (src0->type == GGML_TYPE_F16) {
676
+ sin_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
677
+ } else {
678
+ sin_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
679
+ }
680
  }
681
 
682
  void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
683
  const ggml_tensor * src0 = dst->src[0];
684
+ const void * src0_d = src0->data;
685
+ void * dst_d = dst->data;
686
  cudaStream_t stream = ctx.stream();
687
 
688
  GGML_ASSERT(ggml_is_contiguous(src0));
689
 
690
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
691
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
692
+ GGML_ASSERT(src0->type == dst->type);
693
 
694
+ if (src0->type == GGML_TYPE_F16) {
695
+ cos_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
696
+ } else {
697
+ cos_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
698
+ }
699
+ }
700
+
701
+ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
702
+ const ggml_tensor * src0 = dst->src[0];
703
+ const void * src0_d = src0->data;
704
+ void * dst_d = dst->data;
705
+ cudaStream_t stream = ctx.stream();
706
+
707
+ GGML_ASSERT(ggml_is_contiguous(src0));
708
+
709
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
710
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
711
+ GGML_ASSERT(src0->type == dst->type);
712
+
713
+ if (src0->type == GGML_TYPE_F16) {
714
+ log_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
715
+ } else {
716
+ log_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
717
+ }
718
  }
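
Editorial sketch: the file above repeats one shape roughly twenty times — a templated elementwise kernel, a launcher, and a host wrapper that instantiates it for half or float from the tensor type. The functor-based distillation below is for orientation only; the committed code deliberately keeps one kernel per op and per-op CUDA_*_BLOCK_SIZE constants, and unary_kernel / relu_any are not ggml names.

#include <cuda_fp16.h>

template <class T, class Op>
static __global__ void unary_kernel(const T * x, T * y, const int k, Op op) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    y[i] = op(x[i]);
}

struct relu_op {
    template <class T>
    __device__ T operator()(T v) const {
        return (T)fmaxf(v, 0.0f);   // math in fp32, result narrowed back to T on return
    }
};

template <class T, class Op>
static void unary_launch(const T * x, T * y, const int k, Op op, cudaStream_t stream) {
    const int block = 256;          // assumed; the real launchers use per-op block-size constants
    const int grid  = (k + block - 1) / block;
    unary_kernel<<<grid, block, 0, stream>>>(x, y, k, op);
}

// Type dispatch, as the ggml_cuda_op_* wrappers above do after asserting src/dst types match.
static void relu_any(const void * src, void * dst, const int k, const bool is_f16, cudaStream_t stream) {
    if (is_f16) {
        unary_launch((const half *)src, (half *)dst, k, relu_op{}, stream);
    } else {
        unary_launch((const float *)src, (float *)dst, k, relu_op{}, stream);
    }
}
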
ggml/src/ggml-cuda/unary.cuh CHANGED
@@ -16,6 +16,10 @@
16
  #define CUDA_SIN_BLOCK_SIZE 256
17
  #define CUDA_COS_BLOCK_SIZE 256
18
 
19
  void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
20
 
21
  void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -49,3 +53,5 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
49
  void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
50
 
51
  void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 
 
16
  #define CUDA_SIN_BLOCK_SIZE 256
17
  #define CUDA_COS_BLOCK_SIZE 256
18
 
19
+ void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
20
+
21
+ void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
22
+
23
  void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
24
 
25
  void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
53
  void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
54
 
55
  void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
56
+
57
+ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml/src/ggml-metal/ggml-metal.m CHANGED
@@ -1200,7 +1200,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
1200
  case GGML_UNARY_OP_GELU_QUICK:
1201
  case GGML_UNARY_OP_SILU:
1202
  case GGML_UNARY_OP_ELU:
1203
- return ggml_is_contiguous(op->src[0]);
1204
  default:
1205
  return false;
1206
  }
@@ -1210,21 +1210,26 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
1210
  case GGML_OP_TRANSPOSE:
1211
  case GGML_OP_PERMUTE:
1212
  case GGML_OP_CONCAT:
 
1213
  case GGML_OP_ADD:
1214
  case GGML_OP_SUB:
1215
- case GGML_OP_ACC:
1216
  case GGML_OP_MUL:
1217
  case GGML_OP_DIV:
 
 
1218
  case GGML_OP_REPEAT:
1219
  case GGML_OP_SCALE:
1220
- case GGML_OP_CLAMP:
1221
  case GGML_OP_CONV_TRANSPOSE_1D:
1222
  return true;
 
 
1223
  case GGML_OP_SQR:
1224
  case GGML_OP_SQRT:
1225
  case GGML_OP_SIN:
1226
  case GGML_OP_COS:
1227
- return ggml_is_contiguous(op->src[0]);
 
 
1228
  case GGML_OP_SUM_ROWS:
1229
  case GGML_OP_SOFT_MAX:
1230
  case GGML_OP_GROUP_NORM:
@@ -1254,10 +1259,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
1254
  case GGML_OP_UPSCALE:
1255
  case GGML_OP_PAD:
1256
  case GGML_OP_PAD_REFLECT_1D:
1257
- case GGML_OP_ARANGE:
1258
  case GGML_OP_TIMESTEP_EMBEDDING:
1259
  case GGML_OP_ARGSORT:
1260
  case GGML_OP_LEAKY_RELU:
 
 
1261
  return true;
1262
  case GGML_OP_FLASH_ATTN_EXT:
1263
  if (op->src[1]->type != op->src[2]->type) {
 
1200
  case GGML_UNARY_OP_GELU_QUICK:
1201
  case GGML_UNARY_OP_SILU:
1202
  case GGML_UNARY_OP_ELU:
1203
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1204
  default:
1205
  return false;
1206
  }
 
1210
  case GGML_OP_TRANSPOSE:
1211
  case GGML_OP_PERMUTE:
1212
  case GGML_OP_CONCAT:
1213
+ return true;
1214
  case GGML_OP_ADD:
1215
  case GGML_OP_SUB:
 
1216
  case GGML_OP_MUL:
1217
  case GGML_OP_DIV:
1218
+ return op->src[0]->type == GGML_TYPE_F32;
1219
+ case GGML_OP_ACC:
1220
  case GGML_OP_REPEAT:
1221
  case GGML_OP_SCALE:
 
1222
  case GGML_OP_CONV_TRANSPOSE_1D:
1223
  return true;
1224
+ case GGML_OP_CLAMP:
1225
+ return op->src[0]->type == GGML_TYPE_F32;
1226
  case GGML_OP_SQR:
1227
  case GGML_OP_SQRT:
1228
  case GGML_OP_SIN:
1229
  case GGML_OP_COS:
1230
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1231
+ case GGML_OP_LOG:
1232
+ return false; // TODO: implement
1233
  case GGML_OP_SUM_ROWS:
1234
  case GGML_OP_SOFT_MAX:
1235
  case GGML_OP_GROUP_NORM:
 
1259
  case GGML_OP_UPSCALE:
1260
  case GGML_OP_PAD:
1261
  case GGML_OP_PAD_REFLECT_1D:
 
1262
  case GGML_OP_TIMESTEP_EMBEDDING:
1263
  case GGML_OP_ARGSORT:
1264
  case GGML_OP_LEAKY_RELU:
1265
+ return op->src[0]->type == GGML_TYPE_F32;
1266
+ case GGML_OP_ARANGE:
1267
  return true;
1268
  case GGML_OP_FLASH_ATTN_EXT:
1269
  if (op->src[1]->type != op->src[2]->type) {