; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX128
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX128
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX256

@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@c64 = common global [8 x i64] zeroinitializer, align 64
@a32 = common global [16 x i32] zeroinitializer, align 64
@b32 = common global [16 x i32] zeroinitializer, align 64
@c32 = common global [16 x i32] zeroinitializer, align 64
@a16 = common global [32 x i16] zeroinitializer, align 64
@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64
@a8  = common global [64 x i8] zeroinitializer, align 64
@b8  = common global [64 x i8] zeroinitializer, align 64
@c8  = common global [64 x i8] zeroinitializer, align 64

define void @mul_v8i64() {
; SSE-LABEL: @mul_v8i64(
; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
; SSE-NEXT:    [[R0:%.*]] = mul i64 [[A0]], [[B0]]
; SSE-NEXT:    [[R1:%.*]] = mul i64 [[A1]], [[B1]]
; SSE-NEXT:    [[R2:%.*]] = mul i64 [[A2]], [[B2]]
; SSE-NEXT:    [[R3:%.*]] = mul i64 [[A3]], [[B3]]
; SSE-NEXT:    [[R4:%.*]] = mul i64 [[A4]], [[B4]]
; SSE-NEXT:    [[R5:%.*]] = mul i64 [[A5]], [[B5]]
; SSE-NEXT:    [[R6:%.*]] = mul i64 [[A6]], [[B6]]
; SSE-NEXT:    [[R7:%.*]] = mul i64 [[A7]], [[B7]]
; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
; SSE-NEXT:    ret void
;
; SLM-LABEL: @mul_v8i64(
; SLM-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
; SLM-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
; SLM-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
; SLM-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
; SLM-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; SLM-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
; SLM-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
; SLM-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
; SLM-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
; SLM-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
; SLM-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
; SLM-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
; SLM-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; SLM-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
; SLM-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
; SLM-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
; SLM-NEXT:    [[R0:%.*]] = mul i64 [[A0]], [[B0]]
; SLM-NEXT:    [[R1:%.*]] = mul i64 [[A1]], [[B1]]
; SLM-NEXT:    [[R2:%.*]] = mul i64 [[A2]], [[B2]]
; SLM-NEXT:    [[R3:%.*]] = mul i64 [[A3]], [[B3]]
; SLM-NEXT:    [[R4:%.*]] = mul i64 [[A4]], [[B4]]
; SLM-NEXT:    [[R5:%.*]] = mul i64 [[A5]], [[B5]]
; SLM-NEXT:    [[R6:%.*]] = mul i64 [[A6]], [[B6]]
; SLM-NEXT:    [[R7:%.*]] = mul i64 [[A7]], [[B7]]
; SLM-NEXT:    store i64 [[R0]], ptr @c64, align 8
; SLM-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
; SLM-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
; SLM-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
; SLM-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
; SLM-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
; SLM-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
; SLM-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
; SLM-NEXT:    ret void
;
; AVX128-LABEL: @mul_v8i64(
; AVX128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
; AVX128-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
; AVX128-NEXT:    [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP2]]
; AVX128-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
; AVX128-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
; AVX128-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
; AVX128-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[TMP4]], [[TMP5]]
; AVX128-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
; AVX128-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; AVX128-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; AVX128-NEXT:    [[TMP9:%.*]] = mul <2 x i64> [[TMP7]], [[TMP8]]
; AVX128-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
; AVX128-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
; AVX128-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
; AVX128-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[TMP10]], [[TMP11]]
; AVX128-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
; AVX128-NEXT:    ret void
;
; AVX256-LABEL: @mul_v8i64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
; AVX256-NEXT:    [[TMP3:%.*]] = mul <4 x i64> [[TMP1]], [[TMP2]]
; AVX256-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; AVX256-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[TMP4]], [[TMP5]]
; AVX256-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @mul_v8i64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
; AVX512-NEXT:    [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]
; AVX512-NEXT:    store <8 x i64> [[TMP3]], ptr @c64, align 8
; AVX512-NEXT:    ret void
;
  %a0 = load i64, ptr @a64, align 8
  %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
  %a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
  %a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
  %a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
  %a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
  %a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
  %a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
  %b0 = load i64, ptr @b64, align 8
  %b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
  %b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
  %b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
  %b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
  %b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
  %b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
  %b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
  %r0 = mul i64 %a0, %b0
  %r1 = mul i64 %a1, %b1
  %r2 = mul i64 %a2, %b2
  %r3 = mul i64 %a3, %b3
  %r4 = mul i64 %a4, %b4
  %r5 = mul i64 %a5, %b5
  %r6 = mul i64 %a6, %b6
  %r7 = mul i64 %a7, %b7
  store i64 %r0, ptr @c64, align 8
  store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
  store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
  store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
  store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
  store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
  store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
  store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
  ret void
}

define void @mul_v16i32() {
; SSE-LABEL: @mul_v16i32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
; SSE-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
; SSE-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
; SSE-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; SSE-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
; SSE-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
; SSE-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
; SSE-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
; SSE-NEXT:    ret void
;
; SLM-LABEL: @mul_v16i32(
; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
; SLM-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
; SLM-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
; SLM-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
; SLM-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; SLM-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
; SLM-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
; SLM-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
; SLM-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
; SLM-NEXT:    ret void
;
; AVX128-LABEL: @mul_v16i32(
; AVX128-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
; AVX128-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
; AVX128-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
; AVX128-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
; AVX128-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
; AVX128-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
; AVX128-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
; AVX128-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
; AVX128-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; AVX128-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; AVX128-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
; AVX128-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
; AVX128-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
; AVX128-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
; AVX128-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
; AVX128-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
; AVX128-NEXT:    ret void
;
; AVX256-LABEL: @mul_v16i32(
; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
; AVX256-NEXT:    [[TMP3:%.*]] = mul <8 x i32> [[TMP1]], [[TMP2]]
; AVX256-NEXT:    store <8 x i32> [[TMP3]], ptr @c32, align 4
; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; AVX256-NEXT:    [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]]
; AVX256-NEXT:    store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @mul_v16i32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
; AVX512-NEXT:    [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]]
; AVX512-NEXT:    store <16 x i32> [[TMP3]], ptr @c32, align 4
; AVX512-NEXT:    ret void
;
  %a0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
  %a1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
  %a2  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2 ), align 4
  %a3  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3 ), align 4
  %a4  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4 ), align 4
  %a5  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5 ), align 4
  %a6  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6 ), align 4
  %a7  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7 ), align 4
  %a8  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8 ), align 4
  %a9  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9 ), align 4
  %a10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
  %a11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
  %a12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
  %a13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
  %a14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
  %a15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
  %b0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4
  %b1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4
  %b2  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2 ), align 4
  %b3  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3 ), align 4
  %b4  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4 ), align 4
  %b5  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5 ), align 4
  %b6  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6 ), align 4
  %b7  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7 ), align 4
  %b8  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8 ), align 4
  %b9  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9 ), align 4
  %b10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
  %b11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
  %b12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
  %b13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
  %b14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
  %b15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
  %r0  = mul i32 %a0 , %b0
  %r1  = mul i32 %a1 , %b1
  %r2  = mul i32 %a2 , %b2
  %r3  = mul i32 %a3 , %b3
  %r4  = mul i32 %a4 , %b4
  %r5  = mul i32 %a5 , %b5
  %r6  = mul i32 %a6 , %b6
  %r7  = mul i32 %a7 , %b7
  %r8  = mul i32 %a8 , %b8
  %r9  = mul i32 %a9 , %b9
  %r10 = mul i32 %a10, %b10
  %r11 = mul i32 %a11, %b11
  %r12 = mul i32 %a12, %b12
  %r13 = mul i32 %a13, %b13
  %r14 = mul i32 %a14, %b14
  %r15 = mul i32 %a15, %b15
  store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 0 ), align 4
  store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1 ), align 4
  store i32 %r2 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2 ), align 4
  store i32 %r3 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3 ), align 4
  store i32 %r4 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4 ), align 4
  store i32 %r5 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5 ), align 4
  store i32 %r6 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6 ), align 4
  store i32 %r7 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7 ), align 4
  store i32 %r8 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8 ), align 4
  store i32 %r9 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9 ), align 4
  store i32 %r10, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
  store i32 %r11, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
  store i32 %r12, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
  store i32 %r13, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
  store i32 %r14, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
  store i32 %r15, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
  ret void
}

define void @mul_v32i16() {
; SSE-LABEL: @mul_v32i16(
; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
; SSE-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
; SSE-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
; SSE-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
; SSE-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
; SSE-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
; SSE-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
; SSE-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
; SSE-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
; SSE-NEXT:    ret void
;
; SLM-LABEL: @mul_v32i16(
; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
; SLM-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
; SLM-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
; SLM-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
; SLM-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
; SLM-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
; SLM-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
; SLM-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
; SLM-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
; SLM-NEXT:    ret void
;
; AVX128-LABEL: @mul_v32i16(
; AVX128-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
; AVX128-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
; AVX128-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
; AVX128-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
; AVX128-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
; AVX128-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
; AVX128-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
; AVX128-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
; AVX128-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
; AVX128-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
; AVX128-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
; AVX128-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
; AVX128-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
; AVX128-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
; AVX128-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
; AVX128-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
; AVX128-NEXT:    ret void
;
; AVX256-LABEL: @mul_v32i16(
; AVX256-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
; AVX256-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
; AVX256-NEXT:    [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], [[TMP2]]
; AVX256-NEXT:    store <16 x i16> [[TMP3]], ptr @c16, align 2
; AVX256-NEXT:    [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
; AVX256-NEXT:    [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
; AVX256-NEXT:    [[TMP6:%.*]] = mul <16 x i16> [[TMP4]], [[TMP5]]
; AVX256-NEXT:    store <16 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @mul_v32i16(
; AVX512-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
; AVX512-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
; AVX512-NEXT:    [[TMP3:%.*]] = mul <32 x i16> [[TMP1]], [[TMP2]]
; AVX512-NEXT:    store <32 x i16> [[TMP3]], ptr @c16, align 2
; AVX512-NEXT:    ret void
;
  %a0  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
  %a1  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
  %a2  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2 ), align 2
  %a3  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3 ), align 2
  %a4  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4 ), align 2
  %a5  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5 ), align 2
  %a6  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6 ), align 2
  %a7  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7 ), align 2
  %a8  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8 ), align 2
  %a9  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9 ), align 2
  %a10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
  %a11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
  %a12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
  %a13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
  %a14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
  %a15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
  %a16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
  %a17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
  %a18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
  %a19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
  %a20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
  %a21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
  %a22 = 