SlideShare uma empresa Scribd logo
1 de 46
3,100,000
Transistors
1.4 Billion
Transistors
Ivy Bridge
1971, Intel, 4004
4-bit processor
740 kHz (8 clock
cycles/instruction cycle)
2012, Intel, 80637
(Ivy Bridge)
64-bit processor
4.1 GHz
How(Old Features)
Multithread
Programming
OpenMP PPL / TPL
How(VS2012 New Features)
Auto-
Vectorization
Auto-
Parallelization
C++ AMP
CPU
images source: AMD
performance
portability
productivity
// Element-wise sum of two int arrays on the CPU:
// pC[k] = pA[k] + pB[k] for every k in [0, n), in ascending order.
// Nothing is read or written when n <= 0.
void AddArrays(int n, int * pA, int * pB, int * pC)
{
    int k = 0;
    while (k < n)
    {
        pC[k] = pA[k] + pB[k];
        ++k;
    }
}
// CPU reference version: writes the sum of corresponding elements of pA and
// pB into pC, front to back, for the first n elements. A non-positive n
// leaves all three arrays untouched.
void AddArrays(int n, int * pA, int * pB, int * pC)
{
    for (int remaining = n; remaining > 0; --remaining)
    {
        // Read both inputs at the current position, store the sum, then
        // advance all three cursors together.
        const int total = *pA + *pB;
        *pC = total;
        ++pA;
        ++pB;
        ++pC;
    }
}
#include <amp.h>
using namespace concurrency;
void AddArrays(int n, int * pA, int * pB, int * pC)
{
array_view<int,1> a(n, pA);
array_view<int,1> b(n, pB);
array_view<int,1> sum(n, pC);
parallel_for_each(
sum.extent,
[=](index<1> i) restrict(amp)
{
sum[i] = a[i] + b[i];
}
);
}
void AddArrays(int n, int * pA, int * pB, int * pC)
{
array_view<int,1> a(n, pA);
array_view<int,1> b(n, pB);
array_view<int,1> sum(n, pC);
parallel_for_each(
sum.extent,
[=](index<1> i) restrict(amp)
{
sum[i] = a[i] + b[i];
}
);
}
parallel_for_each:
execute the lambda on
the accelerator once per
thread
extent: the number
and shape of threads to
execute the lambda
index: the thread ID
that is running the
lambda, used to index
into data
restrict(amp): tells the compiler to
check that this code conforms to the C++
AMP language restrictions
array_view: wrap the data to operate
on the accelerator
array_view variables are captured and their associated
data is copied to the accelerator (on demand)
index<1> i(2); index<3> i(2,0,1);
extent<3> e(3,2,2);
index<2> i(0,2);
extent<2> e(3,4);extent<1> e(6);
N <= 128
vector<int> v(10);
extent<2> e(2,5);
array_view<int,2> a(e, v);
//above two lines can also be written
//array_view<int,2> a(2,5,v);
index<2> i(1,3);
int o = a[i]; // or a[i] = 16;
//or int o = a(1, 3);
1. parallel_for_each(
2. e, // e is of type extent<N>
3. [ ](index<N> idx) restrict(amp)
{
// kernel code
}
1. );
• No
• recursion
• 'volatile'
• virtual functions
• pointers to functions
• pointers to member functions
• pointers in structs
• pointers to pointers
• bitfields
• No
• goto or labeled statements
• throw, try, catch
• globals or statics
• dynamic_cast or typeid
• asm declarations
• varargs
• unsupported types
• e.g. char, short, long double
// Declarations used to illustrate restriction specifiers: cos is overloaded
// on its restriction (one unrestricted declaration, one restrict(amp)
// declaration), while bar carries both cpu and amp in a single declaration.
double cos( double d ); // 1a: cpu code
double cos( double d ) restrict(amp); // 1b: amp code
double bar( double d ) restrict(cpu,amp); // 2 : common subset of both
// Launches a restrict(amp) lambda over c.extent and shows which of the
// functions declared above may be called from inside it.
// The "//…" lines mark code that is elided in the original material.
void some_method(array_view<double,2>& c) {
parallel_for_each( c.extent, [=](index<2> idx) restrict(amp)
{
//…
double d0 = c[idx]; // element of c at this invocation's index
double d1 = bar(d0); // ok, bar restrictions include amp
double d2 = cos(d0); // ok, chooses amp overload
//…
});
}
// Serial matrix product over flat, row-major vectors.
//   vA is read as an M x W matrix, vB as a W x N matrix, and the M x N
//   product is stored into vC.
// vC is only indexed, never resized, so it must already hold at least
// M * N elements. With W <= 0 every written element is 0.0f; with M <= 0 or
// N <= 0 nothing is written.
void MatrixMultiplySerial( vector<float>& vC,
    const vector<float>& vA,
    const vector<float>& vB, int M, int N, int W )
{
    for (int r = 0; r < M; ++r)
    {
        for (int c = 0; c < N; ++c)
        {
            // Dot product of row r of A with column c of B, accumulated in
            // ascending k order.
            float acc = 0.0f;
            int k = 0;
            while (k < W)
            {
                acc += vA[r * W + k] * vB[k * N + c];
                ++k;
            }
            vC[r * N + c] = acc;
        }
    }
}
// C++ AMP matrix product over flat vectors.
//   vA is viewed as an M x W matrix, vB as a W x N matrix, and the M x N
//   product is stored into vC. One lambda invocation computes one output
//   element.
// vC is wrapped, not resized, so it must already be large enough for an
// M x N view.
void MatrixMultiplyAMP( vector<float>& vC,
    const vector<float>& vA,
    const vector<float>& vB, int M, int N, int W )
{
    // Inputs are views of const float: the kernel only reads them.
    array_view<const float,2> a(M,W,vA);
    array_view<const float,2> b(W,N,vB);
    array_view<float,2> c(M,N,vC);
    // Every element of c is overwritten, so the existing contents of vC do
    // not need to be transferred to the accelerator.
    c.discard_data();

    parallel_for_each(c.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            // Dot product of row `row` of a with column `col` of b.
            float sum = 0.0f;
            for(int i = 0; i < W; i++)
                sum += a(row, i) * b(i, col);
            c[idx] = sum;
        }
    );

    // Copy the results back into vC here, so that a failure is reported to
    // the caller rather than being left to the view's destructor.
    c.synchronize();
}
CPUs
System memory
GPUPCIe
GPU
GPU
GPU
Host Accelerator (e.g. discrete GPU)
// enumerate all accelerators
vector<accelerator> accs = accelerator::get_all();
// choose one based on your criteria
accelerator acc = accs[0];
// launch a kernel on it
parallel_for_each(acc.default_view, my_extent, [=]…);
DEMO
vector<int> v(5);
iota(v.begin(), v.end(), 0);
array<int, 1> a(5, v.begin(), v.end());
parallel_for_each(a.extent, [&](index<1> idx)
restrict (amp)
{
a[idx] = a[idx] * 2;
});
copy(a, v.begin());
vector<int> v(5);
iota(v.begin(), v.end(), 0);
array_view<int, 1> av(5, v);
parallel_for_each(av.extent, [=](index<1> idx)
restrict (amp)
{
av[idx] = av[idx] * 2;
});
av.synchronize();
1 accelerator_view acc_view = accelerator().default_view;
2
3 int a = 10;
4 std::vector<int> view_data(size);
5 array_view<int, 1> arr_view(size, view_data);
6 std::vector<int> arr_data(size);
7 array<int, 1> arr(size, arr_data.begin(),acc_view);
8
9 // ‘arr_view’ copied by value
10 // ‘a’ copied by value.
11 // ‘arr’ already allocated on accelerator and copied by ref
12
13 parallel_for_each(
14 arr.extent,
15 [=,&arr](index<1> idx) restrict(amp)
16 {
17 arr[idx] = arr_view[idx] + a;
18 arr_view[idx]++;
19 });
20
21 // ‘arr’ needs to be explicitly copied back
22 copy(arr, arr_data.begin());
23
24 // ‘arr_view’ is synchronized back on first use
25 std::cout << arr_view[0] << std::endl;
1. #include <amp.h>
2. #include <amp_math.h>
3. using namespace concurrency;
4. using namespace concurrency::fast_math;
// using namespace
concurrency::precise_math;
5. int main() {
6. float a = 2.2f, b = 3.5f;
7. float result = pow(a,b);
8. std::vector<float> v(1);
9. array_view<float> av(1,v);
10. parallel_for_each(av.extent, [=](index<1>
idx) restrict(amp)
11. {
12. av[idx] = pow(a,b);
13. });
14. }
DEMO
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략

Mais conteúdo relacionado

Mais procurados

Java lejos-multithreading
Java lejos-multithreadingJava lejos-multithreading
Java lejos-multithreading
Mr. Chanuwan
 

Mais procurados (20)

A Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with MultithreadingA Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with Multithreading
 
Low-level Shader Optimization for Next-Gen and DX11 by Emil Persson
Low-level Shader Optimization for Next-Gen and DX11 by Emil PerssonLow-level Shader Optimization for Next-Gen and DX11 by Emil Persson
Low-level Shader Optimization for Next-Gen and DX11 by Emil Persson
 
【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks
 
OpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel ComputingOpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel Computing
 
Introduction to Data Oriented Design
Introduction to Data Oriented DesignIntroduction to Data Oriented Design
Introduction to Data Oriented Design
 
A Step Towards Data Orientation
A Step Towards Data OrientationA Step Towards Data Orientation
A Step Towards Data Orientation
 
An Open Discussion of RISC-V BitManip, trends, and comparisons _ Claire
 An Open Discussion of RISC-V BitManip, trends, and comparisons _ Claire An Open Discussion of RISC-V BitManip, trends, and comparisons _ Claire
An Open Discussion of RISC-V BitManip, trends, and comparisons _ Claire
 
第二回 冬のスイッチ大勉強会 - FullColorLED & MPU-6050編 -
第二回 冬のスイッチ大勉強会 - FullColorLED & MPU-6050編 -第二回 冬のスイッチ大勉強会 - FullColorLED & MPU-6050編 -
第二回 冬のスイッチ大勉強会 - FullColorLED & MPU-6050編 -
 
bpftrace - Tracing Summit 2018
bpftrace - Tracing Summit 2018bpftrace - Tracing Summit 2018
bpftrace - Tracing Summit 2018
 
C++ How I learned to stop worrying and love metaprogramming
C++ How I learned to stop worrying and love metaprogrammingC++ How I learned to stop worrying and love metaprogramming
C++ How I learned to stop worrying and love metaprogramming
 
What&rsquo;s new in Visual C++
What&rsquo;s new in Visual C++What&rsquo;s new in Visual C++
What&rsquo;s new in Visual C++
 
TVM VTA (TSIM)
TVM VTA (TSIM) TVM VTA (TSIM)
TVM VTA (TSIM)
 
PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...
 
Конверсия управляемых языков в неуправляемые
Конверсия управляемых языков в неуправляемыеКонверсия управляемых языков в неуправляемые
Конверсия управляемых языков в неуправляемые
 
Lec06
Lec06Lec06
Lec06
 
Java lejos-multithreading
Java lejos-multithreadingJava lejos-multithreading
Java lejos-multithreading
 
Tiramisu をちょっと、味見してみました。
Tiramisu をちょっと、味見してみました。Tiramisu をちょっと、味見してみました。
Tiramisu をちょっと、味見してみました。
 
Introduction to Stockfish bitboard representation and magic bitboard
Introduction to Stockfish bitboard representation and magic bitboardIntroduction to Stockfish bitboard representation and magic bitboard
Introduction to Stockfish bitboard representation and magic bitboard
 
Design and Implementation of GCC Register Allocation
Design and Implementation of GCC Register AllocationDesign and Implementation of GCC Register Allocation
Design and Implementation of GCC Register Allocation
 
Q4.11: Using GCC Auto-Vectorizer
Q4.11: Using GCC Auto-VectorizerQ4.11: Using GCC Auto-Vectorizer
Q4.11: Using GCC Auto-Vectorizer
 

Semelhante a C++ AMP 실천 및 적용 전략

ISCA Final Presentaiton - Compilations
ISCA Final Presentaiton -  CompilationsISCA Final Presentaiton -  Compilations
ISCA Final Presentaiton - Compilations
HSA Foundation
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
Teddy Hsiung
 
Embedded c programming22 for fdp
Embedded c programming22 for fdpEmbedded c programming22 for fdp
Embedded c programming22 for fdp
Pradeep Kumar TS
 

Semelhante a C++ AMP 실천 및 적용 전략 (20)

Blazing Fast Windows 8 Apps using Visual C++
Blazing Fast Windows 8 Apps using Visual C++Blazing Fast Windows 8 Apps using Visual C++
Blazing Fast Windows 8 Apps using Visual C++
 
LSFMM 2019 BPF Observability
LSFMM 2019 BPF ObservabilityLSFMM 2019 BPF Observability
LSFMM 2019 BPF Observability
 
Introduction to cuda geek camp singapore 2011
Introduction to cuda   geek camp singapore 2011Introduction to cuda   geek camp singapore 2011
Introduction to cuda geek camp singapore 2011
 
Introduction to CUDA
Introduction to CUDAIntroduction to CUDA
Introduction to CUDA
 
ISCA Final Presentaiton - Compilations
ISCA Final Presentaiton -  CompilationsISCA Final Presentaiton -  Compilations
ISCA Final Presentaiton - Compilations
 
Quiz 9
Quiz 9Quiz 9
Quiz 9
 
Lecture 04
Lecture 04Lecture 04
Lecture 04
 
Managing console
Managing consoleManaging console
Managing console
 
Groovy
GroovyGroovy
Groovy
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
 
Adam Sitnik "State of the .NET Performance"
Adam Sitnik "State of the .NET Performance"Adam Sitnik "State of the .NET Performance"
Adam Sitnik "State of the .NET Performance"
 
State of the .Net Performance
State of the .Net PerformanceState of the .Net Performance
State of the .Net Performance
 
Introduction to Accelerators
Introduction to AcceleratorsIntroduction to Accelerators
Introduction to Accelerators
 
Getting Started with Raspberry Pi - DCC 2013.1
Getting Started with Raspberry Pi - DCC 2013.1Getting Started with Raspberry Pi - DCC 2013.1
Getting Started with Raspberry Pi - DCC 2013.1
 
Accelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACCAccelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACC
 
Cpp tutorial
Cpp tutorialCpp tutorial
Cpp tutorial
 
Introduction to CUDA C: NVIDIA : Notes
Introduction to CUDA C: NVIDIA : NotesIntroduction to CUDA C: NVIDIA : Notes
Introduction to CUDA C: NVIDIA : Notes
 
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran LonikarExploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
 
Embedded c programming22 for fdp
Embedded c programming22 for fdpEmbedded c programming22 for fdp
Embedded c programming22 for fdp
 
Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.
 

Mais de 명신 김

Mais de 명신 김 (20)

업무를 빼고 가치를 더하는 클라우드 기술
업무를 빼고 가치를 더하는 클라우드 기술업무를 빼고 가치를 더하는 클라우드 기술
업무를 빼고 가치를 더하는 클라우드 기술
 
[2020 Ignite Seoul]Azure에서 사용할 수 있는 컨테이너/오케스트레이션 기술 살펴보기
[2020 Ignite Seoul]Azure에서 사용할 수 있는 컨테이너/오케스트레이션 기술 살펴보기[2020 Ignite Seoul]Azure에서 사용할 수 있는 컨테이너/오케스트레이션 기술 살펴보기
[2020 Ignite Seoul]Azure에서 사용할 수 있는 컨테이너/오케스트레이션 기술 살펴보기
 
Best of Build Seoul 2019 Keynote
Best of Build Seoul 2019 KeynoteBest of Build Seoul 2019 Keynote
Best of Build Seoul 2019 Keynote
 
Passwordless society
Passwordless societyPasswordless society
Passwordless society
 
DevOps and Azure Devops 소개, 동향, 그리고 기대효과
DevOps and Azure Devops 소개, 동향, 그리고 기대효과DevOps and Azure Devops 소개, 동향, 그리고 기대효과
DevOps and Azure Devops 소개, 동향, 그리고 기대효과
 
Serverless design and adoption
Serverless design and adoptionServerless design and adoption
Serverless design and adoption
 
Durable functions
Durable functionsDurable functions
Durable functions
 
Azure functions v2 announcement
Azure functions v2 announcementAzure functions v2 announcement
Azure functions v2 announcement
 
Azure functions
Azure functionsAzure functions
Azure functions
 
Logic apps
Logic appsLogic apps
Logic apps
 
Serverless
ServerlessServerless
Serverless
 
Azure event grid
Azure event gridAzure event grid
Azure event grid
 
Serverless, Azure Functions, Logic Apps
Serverless, Azure Functions, Logic AppsServerless, Azure Functions, Logic Apps
Serverless, Azure Functions, Logic Apps
 
Microservices architecture
Microservices architectureMicroservices architecture
Microservices architecture
 
Visual studio 2015를 활용한 개발 생산성 및 코드 품질 혁신
Visual studio 2015를 활용한 개발 생산성 및 코드 품질 혁신Visual studio 2015를 활용한 개발 생산성 및 코드 품질 혁신
Visual studio 2015를 활용한 개발 생산성 및 코드 품질 혁신
 
Connect(); 2016 한시간 총정리
Connect(); 2016 한시간 총정리Connect(); 2016 한시간 총정리
Connect(); 2016 한시간 총정리
 
크로스 플랫폼을 품은 오픈 소스 프레임워크 .NET Core
크로스 플랫폼을 품은 오픈 소스 프레임워크 .NET Core크로스 플랫폼을 품은 오픈 소스 프레임워크 .NET Core
크로스 플랫폼을 품은 오픈 소스 프레임워크 .NET Core
 
Coded UI test를 이용한 테스트 자동화
Coded UI test를 이용한 테스트 자동화Coded UI test를 이용한 테스트 자동화
Coded UI test를 이용한 테스트 자동화
 
VS2015 C++ new features
VS2015 C++ new featuresVS2015 C++ new features
VS2015 C++ new features
 
Welcome to the microsoft madness
Welcome to the microsoft madnessWelcome to the microsoft madness
Welcome to the microsoft madness
 

Último

TECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service providerTECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service provider
mohitmore19
 
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdfintroduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
VishalKumarJha10
 
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
9953056974 Low Rate Call Girls In Saket, Delhi NCR
 
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM TechniquesAI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
VictorSzoltysek
 

Último (20)

The Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdfThe Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdf
 
TECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service providerTECUNIQUE: Success Stories: IT Service provider
TECUNIQUE: Success Stories: IT Service provider
 
Chinsurah Escorts ☎️8617697112 Starting From 5K to 15K High Profile Escorts ...
Chinsurah Escorts ☎️8617697112  Starting From 5K to 15K High Profile Escorts ...Chinsurah Escorts ☎️8617697112  Starting From 5K to 15K High Profile Escorts ...
Chinsurah Escorts ☎️8617697112 Starting From 5K to 15K High Profile Escorts ...
 
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdfintroduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
introduction-to-automotive Andoid os-csimmonds-ndctechtown-2021.pdf
 
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
 
VTU technical seminar 8Th Sem on Scikit-learn
VTU technical seminar 8Th Sem on Scikit-learnVTU technical seminar 8Th Sem on Scikit-learn
VTU technical seminar 8Th Sem on Scikit-learn
 
8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students
 
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) SolutionIntroducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
 
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
 
The Top App Development Trends Shaping the Industry in 2024-25 .pdf
The Top App Development Trends Shaping the Industry in 2024-25 .pdfThe Top App Development Trends Shaping the Industry in 2024-25 .pdf
The Top App Development Trends Shaping the Industry in 2024-25 .pdf
 
Define the academic and professional writing..pdf
Define the academic and professional writing..pdfDefine the academic and professional writing..pdf
Define the academic and professional writing..pdf
 
How To Troubleshoot Collaboration Apps for the Modern Connected Worker
How To Troubleshoot Collaboration Apps for the Modern Connected WorkerHow To Troubleshoot Collaboration Apps for the Modern Connected Worker
How To Troubleshoot Collaboration Apps for the Modern Connected Worker
 
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM TechniquesAI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
 
Software Quality Assurance Interview Questions
Software Quality Assurance Interview QuestionsSoftware Quality Assurance Interview Questions
Software Quality Assurance Interview Questions
 
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
 
Direct Style Effect Systems - The Print[A] Example - A Comprehension Aid
Direct Style Effect Systems -The Print[A] Example- A Comprehension AidDirect Style Effect Systems -The Print[A] Example- A Comprehension Aid
Direct Style Effect Systems - The Print[A] Example - A Comprehension Aid
 
Right Money Management App For Your Financial Goals
Right Money Management App For Your Financial GoalsRight Money Management App For Your Financial Goals
Right Money Management App For Your Financial Goals
 
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
 
%in ivory park+277-882-255-28 abortion pills for sale in ivory park
%in ivory park+277-882-255-28 abortion pills for sale in ivory park %in ivory park+277-882-255-28 abortion pills for sale in ivory park
%in ivory park+277-882-255-28 abortion pills for sale in ivory park
 
Microsoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdfMicrosoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdf
 

C++ AMP 실천 및 적용 전략

  • 1.
  • 4. 1971, Intel, 4004 4-bit processor 740 kHz (8 clock cycles/instruction cycle) 2012, Intel, 80637 (Ivy Bridge) 64-bit processor 4.1 GHz
  • 5.
  • 6.
  • 9.
  • 11.
  • 13. void AddArrays(int n, int * pA, int * pB, int * pC) { for (int i=0; i<n; i++) { pC[i] = pA[i] + pB[i]; } } void AddArrays(int n, int * pA, int * pB, int * pC) { for (int i=0; i<n; i++) { pC[i] = pA[i] + pB[i]; } } #include <amp.h> using namespace concurrency; void AddArrays(int n, int * pA, int * pB, int * pC) { array_view<int,1> a(n, pA); array_view<int,1> b(n, pB); array_view<int,1> sum(n, pC); parallel_for_each( sum.extent, [=](index<1> i) restrict(amp) { sum[i] = a[i] + b[i]; } ); }
  • 14. void AddArrays(int n, int * pA, int * pB, int * pC) { array_view<int,1> a(n, pA); array_view<int,1> b(n, pB); array_view<int,1> sum(n, pC); parallel_for_each( sum.extent, [=](index<1> i) restrict(amp) { sum[i] = a[i] + b[i]; } ); } parallel_for_each: execute the lambda on the accelerator once per thread extent: the number and shape of threads to execute the lambda index: the thread ID that is running the lambda, used to index into data restrict(amp): tells the compiler to check that this code conforms to C++ AMP language restriction array_view: wrap the data to operate on the accelerator array_view variables captured and associated data copied to accelerator(on demand)
  • 15. index<1> i(2); index<3> i(2,0,1); extent<3> e(3,2,2); index<2> i(0,2); extent<2> e(3,4);extent<1> e(6); N <= 128
  • 16. vector<int> v(10); extent<2> e(2,5); array_view<int,2> a(e, v); //above two lines can also be written //array_view<int,2> a(2,5,v); index<2> i(1,3); int o = a[i]; // or a[i] = 16; //or int o = a(1, 3);
  • 17. 1. parallel_for_each( 2. e, // e is of type extent<N> 3. [ ](index<N> idx) restrict(amp) { // kernel code } 1. );
  • 18.
  • 19.
  • 20. • No • recursion • 'volatile' • virtual functions • pointers to functions • pointers to member functions • pointers in structs • pointers to pointers • bitfields • No • goto or labeled statements • throw, try, catch • globals or statics • dynamic_cast or typeid • asm declarations • varargs • unsupported types • e.g. char, short, long double
  • 21. double cos( double d ); // 1a: cpu code double cos( double d ) restrict(amp); // 1b: amp code double bar( double d ) restrict(cpu,amp); // 2 : common subset of both void some_method(array_view<double,2>& c) { parallel_for_each( c.extent, [=](index<2> idx) restrict(amp) { //… double d0 = c[idx]; double d1 = bar(d0); // ok, bar restrictions include amp double d2 = cos(d0); // ok, chooses amp overload //… }); }
  • 22. void MatrixMultiplySerial( vector<float>& vC, const vector<float>& vA, const vector<float>& vB, int M, int N, int W ) { for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++){ float sum = 0.0f; for(int i = 0; i < W; i++) sum += vA[row * W + i] * vB[i * N + col]; vC[row * N + col] = sum; } } } void MatrixMultiplyAMP( vector<float>& vC, const vector<float>& vA, const vector<float>& vB, int M, int N, int W ) { array_view<const float,2> a(M,W,vA),b(W,N,vB); array_view<float,2> c(M,N,vC); c.discard_data(); parallel_for_each(c.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; float sum = 0.0f; for(int i = 0; i < W; i++) sum += a(row, i) * b(i, col); c[idx] = sum; } ); }
  • 24. // enumerate all accelerators vector<accelerator> accs = accelerator::get_all(); // choose one based on your criteria accelerator acc = accs[0]; // launch a kernel on it parallel_for_each(acc.default_view, my_extent, [=]…);
  • 25. DEMO
  • 26.
  • 27. vector<int> v(5); iota(v.begin(), v.end(), 0); array<int, 1> a(5, v.begin(), v.end()); parallel_for_each(a.extent, [&](index<1> idx) restrict (amp) { a[idx] = a[idx] * 2; }); copy(a, v.begin()); vector<int> v(5); iota(v.begin(), v.end(), 0); array_view<int, 1> av(5, v); parallel_for_each(av.extent, [=](index<1> idx) restrict (amp) { av[idx] = av[idx] * 2; }); av.synchronize();
  • 28. 1 accelerator_view acc_view = accelerator().default_view; 2 3 int a = 10; 4 std::vector<int> view_data(size); 5 array_view<int, 1> arr_view(size, view_data); 6 std::vector<int> arr_data(size); 7 array<int, 1> arr(size, arr_data.begin(),acc_view); 8
  • 29. 9 // ‘arr_view’ copied by value 10 // ‘a’ copied by value. 11 // ‘arr’ already allocated on accelerator and copied by ref 12 13 parallel_for_each( 14 arr.extent, 15 [=,&arr](index<1> idx) restrict(amp) 16 { 17 arr[idx] = arr_view[idx] + a; 18 arr_view[idx]++; 19 }); 20
  • 30. 21 // ‘arr’ needs to be explicitly copied back 22 copy(arr, arr_data.begin()); 23 24 // ‘arr_view’ is synchronized back on first use 25 std::cout << arr_view[0] << std::endl;
  • 31. 1. #include <amp.h> 2. #include <amp_math.h> 3. using namespace concurrency; 4. using namespace concurrency::fast_math; // using namespace concurrency::precise_math; 5. int main() { 6. float a = 2.2f, b = 3.5f; 7. float result = pow(a,b); 8. std::vector<float> v(1); 9. array_view<float> av(1,v); 10. parallel_for_each(av.extent, [=](index<1> idx) restrict(amp) 11. { 12. av[idx] = pow(a,b); 13. }); 14. }
  • 32.
  • 33.
  • 34.
  • 35.
  • 36.
  • 37.
  • 38. DEMO