0 Replies Latest reply on Nov 21, 2008 12:08 PM by jfkong

    instruction cos and sin


      According to the IL documentation:

      Cosine (cos), Sine (sin)
      Instructions: COS, SIN
      Syntax:
      Function | Opcode    | Syntax        | Description         | Range
      COS      | IL_OP_COS | cos dst, src0 | cosine, cos(src0.w) | [-pi, pi]
      SIN      | IL_OP_SIN | sin dst, src0 | sine, sin(src0.w)   | [-pi, pi]
      Description: Computes the trigonometric function of src0.w, where w is in radians. src0.w must be within
      the specified range for each function; otherwise, the results are undefined.
      The 32-bit floating-point result is placed in all elements of dst. The maximum absolute error is 0.002.


      Instructions: COS_VEC, SIN_VEC
      Syntax:
      Function | Opcode        | Syntax            | Description
      COS_VEC  | IL_OP_COS_VEC | cos_vec dst, src0 | cosine, cos(src0.xyzw)
      SIN_VEC  | IL_OP_SIN_VEC | sin_vec dst, src0 | sine, sin(src0.xyzw)
      Description: Computes the trigonometric function of each element of src0. Each element of src0 must be
      in the range [-100*pi, 100*pi].
      The 32-bit floating-point results are placed in the corresponding elements of dst.
      The maximum absolute error is 0.0008.

      However, when I compile the following Brook+ kernel in GSA:

      // Brook+ kernel: for every stream element, c = cos(a) + b.
      kernel void sum(float a<>, float b<>, out float c<>)
      {
          c = cos(a) + b;
      }

      The generated IL is shown below. Basically, there is no range adjustment of the argument before the cosine instruction is issued:

      ; Literal pool. Each "; lN = (...)" line below shows that literal's bits printed
      ; as floats, so the small bit patterns in l0, l1, l10 and l11 (0, 1, 3, 2) all
      ; print as 0.000000f.
      ; Only l0 and l1 are referenced by the instructions in this listing; l8 (pi) and
      ; l9 (pi/2) are declared but never used.
      ; l0 = (0.000000f 0.000000f 0.000000f 0.000000f)
      dcl_literal l0, 0x00000000, 0x00000000, 0x00000000, 0x00000000
      ; l1 = (0.000000f 0.000000f 0.000000f 0.000000f)
      dcl_literal l1, 0x00000001, 0x00000001, 0x00000001, 0x00000001
      ; l2 = (-1.#QNAN0f -1.#QNAN0f -1.#QNAN0f -1.#QNAN0f)
      dcl_literal l2, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
      ; l3 = (1.#QNAN0f 1.#QNAN0f 1.#QNAN0f 1.#QNAN0f)
      dcl_literal l3, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
      ; l4 = (1.#INF00f 1.#INF00f 1.#INF00f 1.#INF00f)
      dcl_literal l4, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000
      ; l5 = (-0.000000f -0.000000f -0.000000f -0.000000f)
      dcl_literal l5, 0x80000000, 0x80000000, 0x80000000, 0x80000000
      ; l6 = (0.301030f 0.301030f 0.301030f 0.301030f)
      dcl_literal l6, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B
      ; l7 = (0.693147f 0.693147f 0.693147f 0.693147f)
      dcl_literal l7, 0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218
      ; l8 = (3.141593f 3.141593f 3.141593f 3.141593f)
      dcl_literal l8, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
      ; l9 = (1.570796f 1.570796f 1.570796f 1.570796f)
      dcl_literal l9, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB
      ; l10 = (0.000000f 0.000000f 0.000000f 0.000000f)
      dcl_literal l10, 0x00000003, 0x00000003, 0x00000003, 0x00000003
      ; l11 = (0.000000f 0.000000f 0.000000f 0.000000f)
      dcl_literal l11, 0x00000002, 0x00000002, 0x00000002, 0x00000002
      ; Declarations: two generic inputs (v0, v1) and one color output (o0).
      dcl_input_generic v0
      dcl_input_generic v1
      dcl_output_color o0
      ; Entry code: copy v0 and v1 into r270.xy and r271.xy, then run func 36
      ; followed by func 0.
      ; NOTE(review): no end-of-main marker is visible before "func 0" in the listing
      ; as posted - possibly lost in the paste; verify against the raw compiler output.
      mov r270.xy__, v0
      mov r271.xy__, v1
      call 36
      call 0
      ; func 0: copies r272 to the color output o0. r272 is written by the last
      ; instruction of func 36 in this listing.
      func 0
          mov o0, r272
      ; func 2: input fetch. Reads r17.x (selector) and r18.xy (sample coordinate) and
      ; leaves the x component of the sampled value in r16.x.
      ; NOTE(review): no endif/ret/endfunc lines appear in the listing as posted -
      ; presumably dropped in the paste; verify against the raw compiler output.
      func 2
          ; selector == l0.x (bit pattern 0): sample resource 0
          ieq r0.x___, r17.x000, l0.x000
          if_logicalnz r0.x000
              sample_l_resource(0)_sampler(0) r19, r18.xy00, r18.0000
          ; selector == l1.x (bit pattern 1): sample resource 1
          ieq r0.x___, r17.x000, l1.x000
          if_logicalnz r0.x000
              sample_l_resource(1)_sampler(0) r19, r18.xy00, r18.0000
          ; hand the fetched x component back in r16.x
          mov r16.x___, r19.x000
      ; func 35: the arithmetic of the kernel, cos(r265.x) + r266.x -> r267.x.
      ; The caller (func 36) loads r265.x and r266.x with the two fetched inputs and
      ; reads the result from r267.x.
      ; r265.x is passed to cos_vec unmodified: no instruction in this listing reduces
      ; its range first.
      func 35
          cos_vec r268.x___, r265.x000
          add r269.x___, r268.x000, r266.x000
          mov r267.x___, r269.x000
      ; func 36: wrapper around the kernel arithmetic. Fetches one value per input
      ; through func 2, calls func 35, and assembles the 4-component result in r272.
      func 36
          ; first input: selector l0.x (bit pattern 0), coordinate from r270.xy
          mov r17.x___, l0.x000
          mov r18.xy__, r270.xy00
          call 2
          mov r277.x___, r16.x000
          mov r273.x___, r277.x000
          ; second input: selector l1.x (bit pattern 1), coordinate from r271.xy
          mov r17.x___, l1.x000
          mov r18.xy__, r271.xy00
          call 2
          mov r278.x___, r16.x000
          mov r274.x___, r278.x000
          ; pass both fetched values to func 35 in r265.x / r266.x
          mov r265.x___, r273.x000
          mov r266.x___, r274.x000
          call 35
          ; result comes back in r267.x; y, z and w are filled from l0 (all-zero bits)
          mov r275.x___, r267.x000
          mov r276.x___, r275.x000
          mov r276._y__, l0.0x00
          mov r276.__z_, l0.00x0
          mov r276.___w, l0.000x
          ; r272 is what func 0 writes to o0
          mov r272, r276

      Can anyone explain why no range adjustment is generated?