-
Notifications
You must be signed in to change notification settings - Fork 42
233 lines (200 loc) · 9.64 KB
/
triton-benchmarks.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
name: Triton benchmarks
on:
workflow_dispatch:
inputs:
runner_label:
description: Runner label, keep empty for default
type: string
default: ""
tag:
description: Tag for benchmark results
type: string
default: "test"
benchmarking_method:
description: The method used to obtain performance numbers
type: choice
options:
- PYTORCH_LEGACY_PROFILER_USING_IPEX
- ELAPSED_TIME
- UPSTREAM_PYTORCH_PROFILER
default: PYTORCH_LEGACY_PROFILER_USING_IPEX
schedule:
- cron: "5 23 * * *"
pull_request:
branches:
- main
paths:
- .github/workflows/triton-benchmarks.yml
- benchmarks/**
permissions: read-all
env:
PYTHON_VERSION: "3.10"
BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'PYTORCH_LEGACY_PROFILER_USING_IPEX' }}
USE_IPEX: ${{ github.event_name != 'workflow_dispatch' && '1' || inputs.benchmarking_method == 'PYTORCH_LEGACY_PROFILER_USING_IPEX' && '1' || '0' }}
TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && 'pr') || (github.event_name == 'schedule' && 'ci') || 'test' }}
jobs:
build:
name: Triton benchmarks
runs-on:
- ${{ inputs.runner_label || 'max1550' }}
timeout-minutes: 720
defaults:
run:
shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Load pip cache
id: pip-cache
uses: ./.github/actions/load
with:
path: $HOME/.cache/pip
# pip cache per commit id just to minimize network traffic
key: pip-$PYTHON_VERSION-$GITHUB_SHA
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Python build dependencies
run: |
pip install wheel cmake
- name: Setup PyTorch with IPEX
if: ${{ env.USE_IPEX == '1' }}
uses: ./.github/actions/setup-pytorch
with:
repository: Stonepia/pytorch
- name: Setup PyTorch without IPEX
if: ${{ env.USE_IPEX == '0' }}
uses: ./.github/actions/setup-pytorch
with:
repository: pytorch/pytorch
- name: Setup IPEX
if: ${{ env.USE_IPEX == '1' }}
uses: ./.github/actions/setup-ipex
- name: Build Triton wheels
uses: ./.github/actions/setup-triton
with:
command: DEBUG=1 python setup.py bdist_wheel
- name: Install Triton
run: |
pip install python/dist/*.whl
- name: Install benchmark dependencies
run: |
pip install matplotlib pandas tabulate
- name: Create reports dir
run: |
mkdir reports
echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
- name: Install benchmarks
id: install
run: |
cd benchmarks
python setup.py install
- name: Run Triton GEMM kernel benchmark - default path (before)
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Default path:
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-dflt"
python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Default path:
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-dflt"
python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - advanced path (before)
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Advanced path:
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-adv"
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
# Advanced path:
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-adv"
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - default path (before)
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR" \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-dflt"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - default path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=0 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-dflt"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - advanced path (before)
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR" \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-adv"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
TRITON_INTEL_ADVANCED_PATH=1 \
TRITON_INTEL_ENABLE_INSTR_SCHED=1 \
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
python flash_attention_fwd_benchmark.py --reports $REPORTS
TAG="${TAG}-adv"
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Save pip cache
if: ${{ steps.pip-cache.outputs.status == 'miss' }}
uses: ./.github/actions/save
with:
path: ${{ steps.pip-cache.outputs.path }}
dest: ${{ steps.pip-cache.outputs.dest }}
- name: Upload benchmark reports
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
uses: actions/upload-artifact@v4
with:
name: benchmark-reports
path: reports