; RUN: if [ %llvmver -lt 16 ]; then %opt < %s %loadEnzyme -enzyme -enzyme-preopt=false -mem2reg -simplifycfg -S | FileCheck %s --check-prefixes SHARED,MEMSET; fi
; RUN: if [ %llvmver -lt 16 ]; then %opt < %s %loadEnzyme -enzyme -enzyme-preopt=false -mem2reg -simplifycfg -S | FileCheck %s --check-prefixes SHARED,STORE; fi
; RUN: %opt < %s %newLoadEnzyme -enzyme-preopt=false -passes="enzyme,function(mem2reg,%simplifycfg)" -S | FileCheck %s --check-prefixes SHARED,MEMSET
; RUN: %opt < %s %newLoadEnzyme -enzyme-preopt=false -passes="enzyme,function(mem2reg,%simplifycfg)" -S | FileCheck %s --check-prefixes SHARED,STORE

; Marker function consumed by the Enzyme pass. As used in @df below, the
; first argument is the function to differentiate (cast to i8*), followed by
; each pointer argument paired with its derivative pointer.
declare void @__enzyme_autodiff(i8*, double*, double*, double*, double*)

; LLVM memset intrinsic: (destination, fill byte, length in bytes, is-volatile flag).
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)

; Test input, memset variant: zero the 8 bytes at %y with the memset
; intrinsic, then compute *x = *x * *y.
define void @f(double* %x, double* %y) {
  ; View %y as a byte pointer so it can be handed to llvm.memset.
  %yptr = bitcast double* %y to i8*
  ; Overwrite the 8 bytes at %y with zero bytes (non-volatile).
  call void @llvm.memset.p0i8.i64(i8* %yptr, i8 0, i64 8, i1 false)
  %x1 = load double, double* %x, align 8
  ; This load reads back the memory that was just zeroed above.
  %y1 = load double, double* %y, align 8
  %mul = fmul double %x1, %y1
  ; The product is written back through %x, so the result is observed in memory.
  store double %mul, double* %x, align 8
  ret void
}

; Test input, store variant: same computation as @f (*x = *x * *y after
; zeroing *y), but the zeroing is done with a plain double store instead of
; a memset call. The expected derivative code is therefore identical to
; that of @f except for how %y and its derivative pointer are zeroed.
define void @g(double* %x, double* %y) {
  ; Overwrite *y with 0.0 before it is read.
  store double 0.0, double* %y, align 8
  %x1 = load double, double* %x, align 8
  ; This load reads back the value stored just above.
  %y1 = load double, double* %y, align 8
  %mul = fmul double %x1, %y1
  ; The product is written back through %x, so the result is observed in memory.
  store double %mul, double* %x, align 8
  ret void
}

; Driver: requests differentiation of @f and then @g via __enzyme_autodiff.
; Each call passes the target function as an i8* followed by %x with its
; derivative pointer %xp and %y with its derivative pointer %dy. The
; generated functions checked below are named @diffef and @diffeg.
define void @df(double* %x, double* %xp, double* %y, double* %dy) {
  tail call void @__enzyme_autodiff(i8* bitcast (void (double*, double*)* @f to i8*), double* %x, double* %xp, double* %y, double* %dy)
  tail call void @__enzyme_autodiff(i8* bitcast (void (double*, double*)* @g to i8*), double* %x, double* %xp, double* %y, double* %dy)
  ret void
}


; MEMSET: define internal void @diffef(double* %x, double* %"x'", double* %y, double* %"y'")
; STORE:  define internal void @diffeg(double* %x, double* %"x'", double* %y, double* %"y'")
; SHARED-NEXT: invert:
; MEMSET-NEXT:   %"yptr'ipc" = bitcast double* %"y'" to i8*
; MEMSET-NEXT:   %yptr = bitcast double* %y to i8*
; MEMSET-NEXT:   call void @llvm.memset.p0i8.i64(i8* %yptr, i8 0, i64 8, i1 false)
; STORE-NEXT:    store double 0.000000e+00, double* %y, align 8
; SHARED-NEXT:   %x1 = load double, double* %x, align 8
; SHARED-NEXT:   %y1 = load double, double* %y, align 8
; SHARED-NEXT:   %mul = fmul double %x1, %y1
; SHARED-NEXT:   store double %mul, double* %x, align 8
; SHARED-NEXT:   %0 = load double, double* %"x'", align 8
; SHARED-NEXT:   store double 0.000000e+00, double* %"x'", align 8
; SHARED-NEXT:   %1 = fadd fast double 0.000000e+00, %0
; SHARED-NEXT:   %[[m0diffex1:.+]] = fmul fast double %1, %y1
; SHARED-NEXT:   %[[i2:.+]] = fadd fast double 0.000000e+00, %[[m0diffex1]]
; SHARED-NEXT:   %[[m1diffey1:.+]] = fmul fast double %1, %x1
; SHARED-NEXT:   %[[i3:.+]] = fadd fast double 0.000000e+00, %[[m1diffey1]]
; SHARED-NEXT:   %[[i4:.+]] = load double, double* %"y'", align 8
; SHARED-NEXT:   %[[i5:.+]] = fadd fast double %[[i4]], %[[i3]]
; SHARED-NEXT:   store double %[[i5]], double* %"y'", align 8
; SHARED-NEXT:   %[[i6:.+]] = load double, double* %"x'", align 8
; SHARED-NEXT:   %[[i7:.+]] = fadd fast double %[[i6]], %[[i2]]
; SHARED-NEXT:   store double %[[i7]], double* %"x'", align 8
; MEMSET-NEXT:   call void @llvm.memset.p0i8.i64(i8* %"yptr'ipc", i8 0, i64 8, i1 false)
; STORE-NEXT:    store double 0.000000e+00, double* %"y'", align 8
; SHARED-NEXT:   ret void
; SHARED-NEXT: }