A   s%      `      i;Aug 14 202405:08:24HOST64sm_86







// Module header: PTX ISA version 6.3, generated for target sm_75, with
// 64-bit addressing.
.version 6.3
.target sm_75
.address_size 64


// Shared-memory byte array, 4-byte aligned, declared extern with no size.
// The size is not fixed in this file - presumably it is supplied as dynamic
// shared memory at kernel launch (TODO confirm against the host code).
// The kernel in this module accesses it as 32-bit floats.
.extern .shared .align 4 .b8 smatrix[];

// -----------------------------------------------------------------------------
// Kernel entry _Z10matrix_gpuPtPf15red_uni60000000
//   (Itanium-mangled: matrix_gpu(unsigned short*, float*, red_uni60000000)).
//
// Per-thread work, as visible in this PTX:
//   1. Threads with linear in-block index 0..25 copy one float each from the
//      by-value struct (bytes 64..167) into shared array smatrix.
//   2. Each thread computes (x, y) from block/thread ids and exits if
//      y >= struct[+12] or x >= struct[+8].
//   3. Three u16 samples are loaded from the buffer at param_0 (planar or
//      3-way interleaved addressing, chosen by struct[+40]).
//   4. The samples go through normalisation, an optional decode step
//      (struct[+20] == 1), optional squaring (struct[+24] == 1), one of two
//      limiting stages (chosen by struct[+4] < 3), a 3x4 affine matrix held in
//      smatrix[0..11], a soft roll-off relative to the largest channel, a gain
//      (struct[+48]) and a logarithmic encode.
//   5. The three results are rounded, clamped to [0, 65535] and stored back
//      to the same three addresses (in place).
//
// Parameters:
//   param_0 : 64-bit generic pointer to the u16 sample buffer (read + written).
//   param_1 : 64-bit value; never read anywhere in this kernel body.
//   param_2 : 168-byte struct passed by value. Offsets used here:
//               +4  (s32)  selects limiting stage (value < 3 -> BB0_11)
//               +8  (s32)  x bound, also row stride in pixels
//               +12 (s32)  y bound
//               +20 (s32)  == 1 enables the BB0_6 decode step
//               +24 (s32)  == 1 enables squaring of the normalised samples
//               +40 (s32)  == 0 planar addressing, otherwise interleaved
//               +48 (f32)  gain applied before the log encode
//               +64..+167  26 floats copied to smatrix[0..25]
//             Field names/meanings are not visible in this file.
//
// NOTE(review): smatrix entries are written only by threads whose linear
// in-block index exists; the kernel later reads smatrix[0..15], so a block
// with fewer than 16 threads would read unwritten shared memory - confirm
// the launch configuration guarantees enough threads per block.
// NOTE(review): the pixel index and its *3 / *2 scaling are computed in
// 32-bit signed arithmetic before widening (mul.lo.s32 / mul.wide.s32).
//
// Float literals used below (IEEE-754 single):
//   0f477FFF00 = 65535.0   0fC7000000 = -32768.0   0f40000000 = 2.0
//   0f3F800000 = 1.0       0fBF800000 = -1.0       0fC2800000 = -64.0
//   0f3D4CCCCD ~ 0.05      0f3F866666 ~ 1.05       0f3C23D70A ~ 0.01
//   0f431BF9AF ~ 155.9753  0f3E9A209B ~ 0.30103    0f3E65AA2E ~ 0.224282
//   0f4173153F ~ 15.1927   0f3F000000 = 0.5
// -----------------------------------------------------------------------------
.visible .entry _Z10matrix_gpuPtPf15red_uni60000000(
.param .u64 _Z10matrix_gpuPtPf15red_uni60000000_param_0,
.param .u64 _Z10matrix_gpuPtPf15red_uni60000000_param_1,
.param .align 2 .b8 _Z10matrix_gpuPtPf15red_uni60000000_param_2[168]
)
{
// Virtual register declarations (predicate, 16-bit, f32, 32-bit, 64-bit).
.reg .pred %p<22>;
.reg .b16 %rs<4>;
.reg .f32 %f<206>;
.reg .b32 %r<40>;
.reg .b64 %rd<34>;


// %rd3 = sample buffer pointer (param_0); %rd1 = address of the param_2 struct
// in parameter space, used as the base for every ld.param below.
ld.param.u64 %rd3, [_Z10matrix_gpuPtPf15red_uni60000000_param_0];
mov.b64	%rd4, _Z10matrix_gpuPtPf15red_uni60000000_param_2;
mov.u64 %rd1, %rd4;
// %r4 = ntid.x * tid.y + tid.x : linear thread index within the block.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %tid.y;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
// Only linear indices 0..25 take part in the shared-memory fill.
setp.gt.s32	%p1, %r4, 25;
@%p1 bra BB0_2;

// smatrix[%r4] = float at struct byte offset 64 + 4 * %r4.
mul.wide.s32 %rd5, %r4, 4;
add.s64 %rd6, %rd1, %rd5;
ld.param.f32 %f69, [%rd6+64];
shl.b32 %r12, %r4, 2;
mov.u32 %r13, smatrix;
add.s32 %r14, %r13, %r12;
st.shared.f32 [%r14], %f69;

BB0_2:
// Barrier 0: executed by every thread before any bounds-check exit, so the
// shared-memory stores above are complete before smatrix is read.
bar.sync 0;
// %r5 = x = ctaid.x * ntid.x + tid.x
mov.u32 %r15, %ctaid.x;
mad.lo.s32 %r5, %r15, %r1, %r3;
// %r6 = y = ntid.y * ctaid.y + tid.y
mov.u32 %r16, %ntid.y;
mov.u32 %r17, %ctaid.y;
mad.lo.s32 %r6, %r16, %r17, %r2;
// Exit when y >= struct[+12].
ld.param.u32 %r7, [%rd1+12];
setp.ge.s32	%p2, %r6, %r7;
@%p2 bra BB0_36;

// Exit when x >= struct[+8].
ld.param.u32 %r8, [%rd1+8];
setp.ge.s32	%p3, %r5, %r8;
@%p3 bra BB0_36;

// Address setup. %rd7 = global-space base of the sample buffer.
cvta.to.global.u64 %rd7, %rd3;
// %r18 = struct[+8] * struct[+12] (elements per plane);
// %r19 = struct[+8] * y + x (linear pixel index).
mul.lo.s32 %r18, %r8, %r7;
mad.lo.s32 %r19, %r8, %r6, %r5;
// %rd9 = base + 1 plane, %rd11 = base + 2 planes (2 bytes per element).
mul.wide.s32 %rd8, %r18, 2;
add.s64 %rd9, %rd7, %rd8;
add.s32 %r20, %r18, %r18;
mul.wide.s32 %rd10, %r20, 2;
add.s64 %rd11, %rd7, %rd10;
// %p4 = (struct[+40] == 0).
//   true : element index = pixel index, channel bases = base / +1 plane / +2 planes
//   false: element index = 3 * pixel index, channel bases = base / +2 B / +4 B
ld.param.u32 %r9, [%rd1+40];
setp.eq.s32	%p4, %r9, 0;
mul.lo.s32 %r21, %r19, 3;
selp.b32	%r22, %r19, %r21, %p4;
add.s64 %rd12, %rd7, 2;
selp.b64	%rd13, %rd9, %rd12, %p4;
add.s64 %rd14, %rd7, 4;
selp.b64	%rd15, %rd11, %rd14, %p4;
// Load the three u16 samples and convert to f32.
// %rd17 / %rd18 / %rd19 are kept and reused as the store addresses at BB0_35.
mul.wide.s32 %rd16, %r22, 2;
add.s64 %rd17, %rd7, %rd16;
ld.global.u16 %rs1, [%rd17];
cvt.rn.f32.u16	%f1, %rs1;
add.s64 %rd18, %rd13, %rd16;
ld.global.u16 %rs2, [%rd18];
cvt.rn.f32.u16	%f2, %rs2;
add.s64 %rd19, %rd15, %rd16;
ld.global.u16 %rs3, [%rd19];
cvt.rn.f32.u16	%f189, %rs3;
// Mode fields: %r10 = struct[+24], %r11 = struct[+4], %r23 = struct[+20].
ld.param.u32 %r10, [%rd1+24];
ld.param.u32 %r11, [%rd1+4];
ld.param.u32 %r23, [%rd1+20];
setp.eq.s32	%p5, %r23, 1;
// %f70 = 65535.0 (used as scale and clamp limit throughout);
// %f4 = second sample / 65535.
mov.f32 %f70, 0f477FFF00;
div.approx.ftz.f32 %f4, %f2, %f70;
@%p5 bra BB0_6;
bra.uni BB0_5;

BB0_6:
// struct[+20] == 1: first and third samples are rebuilt from the second:
//   v = clamp((sample - 32768) * 2 + second, 0, 65535)
// First channel is normalised here (%f190); third stays unnormalised in %f189
// and is normalised at BB0_7.
add.ftz.f32 %f72, %f1, 0fC7000000;
fma.rn.ftz.f32 %f73, %f72, 0f40000000, %f2;
min.ftz.f32 %f75, %f73, %f70;
mov.f32 %f76, 0f00000000;
max.ftz.f32 %f77, %f76, %f75;
div.approx.ftz.f32 %f190, %f77, %f70;
add.ftz.f32 %f78, %f189, 0fC7000000;
fma.rn.ftz.f32 %f79, %f78, 0f40000000, %f2;
min.ftz.f32 %f80, %f79, %f70;
max.ftz.f32 %f189, %f76, %f80;
bra.uni BB0_7;

BB0_5:
// struct[+20] != 1: first sample is simply normalised by 65535.
div.approx.ftz.f32 %f190, %f1, %f70;

BB0_7:
// Normalise the third channel, then (if struct[+24] == 1) square all three.
// Results: %f10, %f11, %f12 = channels 1, 2, 3.
div.approx.ftz.f32 %f82, %f189, %f70;
mul.ftz.f32 %f83, %f82, %f82;
mul.ftz.f32 %f84, %f190, %f190;
setp.eq.s32	%p6, %r10, 1;
selp.f32	%f10, %f84, %f190, %p6;
mul.ftz.f32 %f85, %f4, %f4;
selp.f32	%f11, %f85, %f4, %p6;
selp.f32	%f12, %f83, %f82, %p6;
// struct[+4] < 3 selects the blend-style limiter (BB0_11), else BB0_8.
setp.lt.s32	%p7, %r11, 3;
@%p7 bra BB0_11;
bra.uni BB0_8;

BB0_11:
// Blend limiter, with w = smatrix[15] (byte 60) and per-channel limits
// smatrix[12..14] (bytes 48/52/56):
//   out = v * (1 - w) + w * min(v, limit)
ld.shared.f32 %f97, [smatrix+60];
mov.f32 %f98, 0f3F800000;
sub.ftz.f32 %f99, %f98, %f97;
ld.shared.f32 %f100, [smatrix+48];
min.ftz.f32 %f101, %f10, %f100;
mul.ftz.f32 %f102, %f97, %f101;
fma.rn.ftz.f32 %f194, %f10, %f99, %f102;
ld.shared.f32 %f103, [smatrix+52];
min.ftz.f32 %f104, %f11, %f103;
mul.ftz.f32 %f105, %f104, %f97;
fma.rn.ftz.f32 %f195, %f11, %f99, %f105;
ld.shared.f32 %f106, [smatrix+56];
min.ftz.f32 %f107, %f12, %f106;
mul.ftz.f32 %f108, %f97, %f107;
fma.rn.ftz.f32 %f196, %f12, %f99, %f108;
bra.uni BB0_12;

BB0_8:
// Ratio limiter: divide each channel by its limit smatrix[12..14]
// (%f191..%f193), clamp the ratios to [0, 65535] (%f19..%f21) and take the
// largest clamped ratio (%f22).
ld.shared.f32 %f13, [smatrix+48];
div.approx.ftz.f32 %f191, %f10, %f13;
ld.shared.f32 %f15, [smatrix+52];
div.approx.ftz.f32 %f192, %f11, %f15;
ld.shared.f32 %f17, [smatrix+56];
div.approx.ftz.f32 %f193, %f12, %f17;
min.ftz.f32 %f87, %f191, %f70;
mov.f32 %f88, 0f00000000;
max.ftz.f32 %f19, %f88, %f87;
min.ftz.f32 %f89, %f192, %f70;
max.ftz.f32 %f20, %f88, %f89;
min.ftz.f32 %f90, %f193, %f70;
max.ftz.f32 %f21, %f88, %f90;
max.ftz.f32 %f91, %f20, %f21;
max.ftz.f32 %f22, %f19, %f91;
// Skip the rescale when the largest clamped ratio is <= 0 or unordered (NaN);
// the unclamped ratios are then used as-is.
setp.leu.ftz.f32	%p8, %f22, 0f00000000;
@%p8 bra BB0_10;

// ratio_i = max(unclamped ratios) * clamped_i / max(clamped ratios)
max.ftz.f32 %f92, %f192, %f193;
max.ftz.f32 %f93, %f191, %f92;
div.approx.ftz.f32 %f94, %f19, %f22;
mul.ftz.f32 %f191, %f93, %f94;
div.approx.ftz.f32 %f95, %f20, %f22;
mul.ftz.f32 %f192, %f93, %f95;
div.approx.ftz.f32 %f96, %f21, %f22;
mul.ftz.f32 %f193, %f93, %f96;

BB0_10:
// Scale the ratios back by the per-channel limits.
mul.ftz.f32 %f194, %f13, %f191;
mul.ftz.f32 %f195, %f15, %f192;
mul.ftz.f32 %f196, %f17, %f193;

BB0_12:
// 3x4 affine transform using smatrix[0..11], row-major, 4th column = offset:
//   row_k = m[4k]*%f194 + m[4k+1]*%f195 + m[4k+2]*%f196 + m[4k+3]
ld.shared.f32 %f109, [smatrix];
ld.shared.f32 %f110, [smatrix+4];
mul.ftz.f32 %f111, %f195, %f110;
fma.rn.ftz.f32 %f112, %f194, %f109, %f111;
ld.shared.f32 %f113, [smatrix+8];
fma.rn.ftz.f32 %f114, %f196, %f113, %f112;
ld.shared.f32 %f115, [smatrix+12];
add.ftz.f32 %f116, %f115, %f114;
ld.shared.f32 %f117, [smatrix+16];
ld.shared.f32 %f118, [smatrix+20];
mul.ftz.f32 %f119, %f195, %f118;
fma.rn.ftz.f32 %f120, %f194, %f117, %f119;
ld.shared.f32 %f121, [smatrix+24];
fma.rn.ftz.f32 %f122, %f196, %f121, %f120;
ld.shared.f32 %f123, [smatrix+28];
add.ftz.f32 %f124, %f123, %f122;
ld.shared.f32 %f125, [smatrix+32];
ld.shared.f32 %f126, [smatrix+36];
mul.ftz.f32 %f127, %f195, %f126;
fma.rn.ftz.f32 %f128, %f194, %f125, %f127;
ld.shared.f32 %f129, [smatrix+40];
fma.rn.ftz.f32 %f130, %f196, %f129, %f128;
ld.shared.f32 %f131, [smatrix+44];
add.ftz.f32 %f132, %f131, %f130;
// Rescale each row: value * 65535 - 64.
fma.rn.ftz.f32 %f200, %f116, 0f477FFF00, 0fC2800000;
fma.rn.ftz.f32 %f201, %f124, 0f477FFF00, 0fC2800000;
fma.rn.ftz.f32 %f202, %f132, 0f477FFF00, 0fC2800000;
// %f41 = largest of the three, selected with compare + selp.
setp.gt.ftz.f32	%p9, %f200, %f201;
selp.f32	%f133, %f200, %f201, %p9;
setp.gt.ftz.f32	%p10, %f133, %f202;
selp.f32	%f41, %f133, %f202, %p10;
// Skip the roll-off stage entirely when the largest value is <= 0.
setp.le.ftz.f32	%p11, %f41, 0f00000000;
@%p11 bra BB0_26;

// Roll-off stage. Each channel is expressed relative to the maximum
// (r = channel / max) and remapped:
//   r <  -1.0          -> 0
//   r >= 0.05 (or NaN) -> r unchanged
//   otherwise          -> 0.05 * ((r + 1) / 1.05) ^ (1.05 / 0.05)
// The power is evaluated as ex2(lg2(x) * exponent).
div.approx.ftz.f32 %f42, %f200, %f41;
div.approx.ftz.f32 %f43, %f201, %f41;
div.approx.ftz.f32 %f44, %f202, %f41;
setp.lt.ftz.f32	%p12, %f42, 0fBF800000;
// %f199 = 0 doubles as the default result for the third channel (see BB0_21).
mov.f32 %f199, 0f00000000;
mov.f32 %f197, %f199;
@%p12 bra BB0_17;

setp.geu.ftz.f32	%p13, %f42, 0f3D4CCCCD;
@%p13 bra BB0_15;

// Channel 1, -1 <= r < 0.05.
add.ftz.f32 %f135, %f42, 0f3F800000;
mov.f32 %f136, 0f3F866666;
div.approx.ftz.f32 %f137, %f135, %f136;
lg2.approx.ftz.f32 %f138, %f137;
mov.f32 %f139, 0f3D4CCCCD;
div.approx.ftz.f32 %f140, %f136, %f139;
mul.ftz.f32 %f141, %f140, %f138;
ex2.approx.ftz.f32 %f142, %f141;
mul.ftz.f32 %f197, %f142, 0f3D4CCCCD;
bra.uni BB0_17;

BB0_15:
// Channel 1 passes through unchanged.
mov.f32 %f197, %f42;

BB0_17:
// Channel 2: same remap, result in %f198 (defaults to 0).
setp.lt.ftz.f32	%p14, %f43, 0fBF800000;
mov.f32 %f198, %f199;
@%p14 bra BB0_21;

setp.geu.ftz.f32	%p15, %f43, 0f3D4CCCCD;
@%p15 bra BB0_19;

add.ftz.f32 %f144, %f43, 0f3F800000;
mov.f32 %f145, 0f3F866666;
div.approx.ftz.f32 %f146, %f144, %f145;
lg2.approx.ftz.f32 %f147, %f146;
mov.f32 %f148, 0f3D4CCCCD;
div.approx.ftz.f32 %f149, %f145, %f148;
mul.ftz.f32 %f150, %f149, %f147;
ex2.approx.ftz.f32 %f151, %f150;
mul.ftz.f32 %f198, %f151, 0f3D4CCCCD;
bra.uni BB0_21;

BB0_19:
// Channel 2 passes through unchanged.
mov.f32 %f198, %f43;

BB0_21:
// Channel 3: same remap, result in %f199 (still 0 from above if r < -1).
setp.lt.ftz.f32	%p16, %f44, 0fBF800000;
@%p16 bra BB0_25;

setp.geu.ftz.f32	%p17, %f44, 0f3D4CCCCD;
@%p17 bra BB0_23;

add.ftz.f32 %f153, %f44, 0f3F800000;
mov.f32 %f154, 0f3F866666;
div.approx.ftz.f32 %f155, %f153, %f154;
lg2.approx.ftz.f32 %f156, %f155;
mov.f32 %f157, 0f3D4CCCCD;
div.approx.ftz.f32 %f158, %f154, %f157;
mul.ftz.f32 %f159, %f158, %f156;
ex2.approx.ftz.f32 %f160, %f159;
mul.ftz.f32 %f199, %f160, 0f3D4CCCCD;
bra.uni BB0_25;

BB0_23:
// Channel 3 passes through unchanged.
mov.f32 %f199, %f44;

BB0_25:
// Scale the remapped ratios back by the maximum.
mul.ftz.f32 %f200, %f41, %f197;
mul.ftz.f32 %f201, %f41, %f198;
mul.ftz.f32 %f202, %f41, %f199;

BB0_26:
// Apply gain struct[+48], normalise by 65535 and add 0.01:
//   x = value * gain / 65535 + 0.01
ld.param.f32 %f161, [%rd1+48];
mul.ftz.f32 %f162, %f200, %f161;
mul.ftz.f32 %f163, %f201, %f161;
mul.ftz.f32 %f164, %f202, %f161;
div.approx.ftz.f32 %f166, %f162, %f70;
div.approx.ftz.f32 %f167, %f163, %f70;
div.approx.ftz.f32 %f168, %f164, %f70;
add.ftz.f32 %f57, %f166, 0f3C23D70A;
add.ftz.f32 %f58, %f167, 0f3C23D70A;
add.ftz.f32 %f59, %f168, 0f3C23D70A;
// Log encode, per channel:
//   x <  0 : y = x * 15.1927
//   x >= 0 : y = 0.224282 * log10(x * 155.9753 + 1)
//            (log10 formed as lg2 * 0.30103)
// NOTE(review): these constants match the published REDLog3G10 curve
// parameters - presumably that is the intended encoding; confirm.
setp.lt.ftz.f32	%p18, %f57, 0f00000000;
@%p18 bra BB0_28;
bra.uni BB0_27;

BB0_28:
// Channel 1, linear segment for negative input.
mul.ftz.f32 %f203, %f57, 0f4173153F;
bra.uni BB0_29;

BB0_27:
// Channel 1, logarithmic segment.
fma.rn.ftz.f32 %f169, %f57, 0f431BF9AF, 0f3F800000;
lg2.approx.ftz.f32 %f170, %f169;
mul.ftz.f32 %f171, %f170, 0f3E9A209B;
mul.ftz.f32 %f203, %f171, 0f3E65AA2E;

BB0_29:
setp.lt.ftz.f32	%p19, %f58, 0f00000000;
@%p19 bra BB0_31;
bra.uni BB0_30;

BB0_31:
// Channel 2, linear segment for negative input.
mul.ftz.f32 %f204, %f58, 0f4173153F;
bra.uni BB0_32;

BB0_30:
// Channel 2, logarithmic segment.
fma.rn.ftz.f32 %f172, %f58, 0f431BF9AF, 0f3F800000;
lg2.approx.ftz.f32 %f173, %f172;
mul.ftz.f32 %f174, %f173, 0f3E9A209B;
mul.ftz.f32 %f204, %f174, 0f3E65AA2E;

BB0_32:
setp.lt.ftz.f32	%p20, %f59, 0f00000000;
@%p20 bra BB0_34;
bra.uni BB0_33;

BB0_34:
// Channel 3, linear segment for negative input.
mul.ftz.f32 %f205, %f59, 0f4173153F;
bra.uni BB0_35;

BB0_33:
// Channel 3, logarithmic segment.
fma.rn.ftz.f32 %f175, %f59, 0f431BF9AF, 0f3F800000;
lg2.approx.ftz.f32 %f176, %f175;
mul.ftz.f32 %f177, %f176, 0f3E9A209B;
mul.ftz.f32 %f205, %f177, 0f3E65AA2E;

BB0_35:
// Quantise and store in place: out = trunc(clamp(y * 65535 + 0.5, 0, 65535)),
// written as u16 to the same three addresses the samples were loaded from.
// The 32-bit source register is narrowed by the 16-bit store.
fma.rn.ftz.f32 %f178, %f203, 0f477FFF00, 0f3F000000;
min.ftz.f32 %f180, %f178, %f70;
mov.f32 %f181, 0f00000000;
max.ftz.f32 %f182, %f181, %f180;
cvt.rzi.ftz.u32.f32	%r24, %f182;
st.global.u16 [%rd17], %r24;
fma.rn.ftz.f32 %f183, %f204, 0f477FFF00, 0f3F000000;
min.ftz.f32 %f184, %f183, %f70;
max.ftz.f32 %f185, %f181, %f184;
cvt.rzi.ftz.u32.f32	%r36, %f185;
st.global.u16 [%rd18], %r36;
fma.rn.ftz.f32 %f186, %f205, 0f477FFF00, 0f3F000000;
min.ftz.f32 %f187, %f186, %f70;
max.ftz.f32 %f188, %f181, %f187;
cvt.rzi.ftz.u32.f32	%r38, %f188;
st.global.u16 [%rd19], %r38;

BB0_36:
// Common exit, also reached by the out-of-bounds early exits above.
ret;
}


  ELF3         ~                        VK @ 8  @    .shstrtab .strtab .symtab .symtab_shndx .nv.info .text._Z10matrix_gpuPtPf15red_uni60000000 .nv.info._Z10matrix_gpuPtPf15red_uni60000000 .nv.shared._Z10matrix_gpuPtPf15red_uni60000000 .nv.constant2._Z10matrix_gpuPtPf15red_uni60000000 .nv.constant0._Z10matrix_gpuPtPf15red_uni60000000 .rel.nv.constant0._Z10matrix_gpuPtPf15red_uni60000000 .debug_frame .rel.debug_frame .rela.debug_frame .nv.callgraph .nv.prototype .nv.rel.action  .shstrtab .strtab .symtab .symtab_shndx .nv.info .text._Z10matrix_gpuPtPf15red_uni60000000 .nv.info._Z10matrix_gpuPtPf15red_uni60000000 .nv.shared._Z10matrix_gpuPtPf15red_uni60000000 .nv.constant2._Z10matrix_gpuPtPf15red_uni60000000 .rel.nv.constant0._Z10matrix_gpuPtPf15red_uni60000000 .nv.constant0._Z10matrix_gpuPtPf15red_uni60000000 .debug_frame .rel.debug_frame .rela.debug_frame .nv.callgraph .nv.prototype .nv.rel.action _Z10matrix_gpuPtPf15red_uni60000000                           2                         
                                      R                                                                              $        |( ((   4                                 <   ( L         /                       7 ~   5  
    `                  !           !  D @          0                       s          % 6D                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    z  
       y      "   ( y      !   ( y      &   h y      %    $z       $z    / x     pB  $z     O z  _  pb           $       h      $       H    {           M          z  ^  pb  M	          z  ^       $z	 ^    z  f  pR  z  F   
    z  X       $z _    x         x     r    %v X    z Y     z X    %v X    			     r        r        r        z Y     %x	     r        %v	 X   y
      %x	     y       y     " z	  a       x 	   pR  z  b       x    pR  s
 
      0N s       ps          !
        #   @   / 	 G    !	        O 	
       z  ]       #		   @    x    pb  			 G    			          7  A    x  7  A    x
 7  A    	 7  A            A         A         A   G         y 0      " Ey        s 	       0 s 
       p s          r      A   	x G     r     A  / 	r       	x G     r     A  O 	r       	x  G    	r        	r        	r        r     @  G         s        " 	r       	r        r      A   r     A   r     A    r      A   r     A   r     A   Ay           r     A    r 	     A    r
     A   Gy         y 0      $ 	r       !t  ?    	r 	       	r

       r	     A   r     A   r

     A   #r   	   #r        #r   
    y        " Ey  p      y       h y          r	 	     A   r      A  / #r   	   #r        r      A  O #r
      x   G     #r      #r       !r        !r        #r       #t       #t
       !r        r 
    @  #t       r
         r     @  r         r       G        s        " Ey         r     A   r
     A   r     A   r         x      x      x      r         G         x L=   !  ?      >s?  A           $  	  A  A   	 	       $  	L=  A   "         Ay         Ey        r
         G         x L=   !  ?      >s?  A           $  	  A  A   	 	       $  
	L=  A   
         Ay         Ey        G         x L=   !  ?      >s?  A           $  	  A  A   	 	       $  	L=  A            Ay          r     A   r

     A   r     A   Ay          x  7      z h    A    z

 h    A    z h    A   #t
#<   #t

#<   #t
#<    r       r       r        C     	 C     ( C     #  ?    #
  ?	    #$  ?            0 
 
       p #          	 >  A    		.e>  A    	?sA  A    
 >  A  / #t		   ?      .e>  A    ?sA  A   	x		 G     ( >  A  O #t   ?     	r		        (.e>  A    ?sA  A   	x G    #t   ?     	r       s	 	    !  & 	x G    	r       s     !  p s     !   y 	     y     / y     O My          Gy    y            y            y            y            y            y            y            y            y            y            y                                                                                                  @                                                                                                                                                    R                           p                              )      p                      $                             \      p@                     t                                p                                                       p                                                   _  	   @                                                      B                                                        B                                                 2                                                            C                                                                                                                                 8      8                                                                                                                  