Comment atteindre le maximum théorique de 4 FLOP par cycle ?
Il est théoriquement possible d'atteindre une performance maximale de 4 opérations en virgule flottante (double précision) par cycle sur les processeurs Intel x86-64 modernes, en utilisant les techniques suivantes :
Optimisation du code pour les instructions SSE
Déroulement de boucle et entrelacement
Regrouper les opérations par groupes de trois
Éviter les blocages et les dépendances inutiles
Exemple de code
L'extrait de code suivant montre comment atteindre des performances proches des performances maximales sur les processeurs Intel Core i5 et Core i7 :
#include <emmintrin.h> #include <omp.h> #include <iostream> using namespace std; typedef unsigned long long uint64; double test_dp_mac_SSE(double x, double y, uint64 iterations) { register __m128d r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF; // Generate starting data. r0 = _mm_set1_pd(x); r1 = _mm_set1_pd(y); r8 = _mm_set1_pd(-0.0); r2 = _mm_xor_pd(r0, r8); r3 = _mm_or_pd(r0, r8); r4 = _mm_andnot_pd(r8, r0); r5 = _mm_mul_pd(r1, _mm_set1_pd(0.37796447300922722721)); r6 = _mm_mul_pd(r1, _mm_set1_pd(0.24253562503633297352)); r7 = _mm_mul_pd(r1, _mm_set1_pd(4.1231056256176605498)); r8 = _mm_add_pd(r0, _mm_set1_pd(0.37796447300922722721)); r9 = _mm_add_pd(r1, _mm_set1_pd(0.24253562503633297352)); rA = _mm_sub_pd(r0, _mm_set1_pd(4.1231056256176605498)); rB = _mm_sub_pd(r1, _mm_set1_pd(4.1231056256176605498)); rC = _mm_set1_pd(1.4142135623730950488); rD = _mm_set1_pd(1.7320508075688772935); rE = _mm_set1_pd(0.57735026918962576451); rF = _mm_set1_pd(0.70710678118654752440); uint64 iMASK = 0x800fffffffffffffull; __m128d MASK = _mm_set1_pd(*(double*)&iMASK); __m128d vONE = _mm_set1_pd(1.0); uint64 c = 0; while (c < iterations) { size_t i = 0; while (i < 1000) { // Main computational loop r0 = _mm_mul_pd(r0, rC); r1 = _mm_add_pd(r1, rD); r2 = _mm_mul_pd(r2, rE); r3 = _mm_sub_pd(r3, rF); r4 = _mm_mul_pd(r4, rC); r5 = _mm_add_pd(r5, rD); r6 = _mm_mul_pd(r6, rE); r7 = _mm_sub_pd(r7, rF); r8 = _mm_mul_pd(r8, rC); r9 = _mm_add_pd(r9, rD); rA = _mm_mul_pd(rA, rE); rB = _mm_sub_pd(rB, rF); r0 = _mm_add_pd(r0, rF); r1 = _mm_mul_pd(r1, rE); r2 = _mm_sub_pd(r2, rD); r3 = _mm_mul_pd(r3, rC); r4 = _mm_add_pd(r4, rF); r5 = _mm_mul_pd(r5, rE); r6 = _mm_sub_pd(r6, rD); r7 = _mm_mul_pd(r7, rC); r8 = _mm_add_pd(r8, rF); r9 = _mm_mul_pd(r9, rE); rA = _mm_sub_pd(rA, rD); rB = _mm_mul_pd(rB, rC); r0 = _mm_mul_pd(r0, rC); r1 = _mm_add_pd(r1, rD); r2 = _mm_mul_pd(r2, rE); r3 = _mm_sub_pd(r3, rF); r4 = _mm_mul_pd(r4, rC); r5 = _mm_add_pd(r5, rD); r6 = _mm_mul_pd(r6, rE); r7 = 
_mm_sub_pd(r7, rF); r8 = _mm_mul_pd(r8, rC); r9 = _mm_add_pd(r9, rD);
Ce qui précède est le contenu détaillé de l'article « Comment atteindre le maximum théorique de 4 FLOP par cycle ? ». Pour plus d'informations, consultez les autres articles connexes sur le site Web PHP chinois !