差分
このページの2つのバージョン間の差分を表示します。
次のリビジョン | 前のリビジョン | ||
web:google:colaboratory [2020/06/10 07:21] – 作成 ともやん | web:google:colaboratory [2020/09/11 16:46] (現在) – ともやん | ||
---|---|---|---|
行 1: | 行 1: | ||
- | < | ||
- | < | ||
- | #result pre, #mincode pre { | ||
- | overflow: hidden; | ||
- | font-size: 10px; | ||
- | } | ||
- | # | ||
- | height: 250px; | ||
- | overflow: scroll; | ||
- | overflow-x: hidden; | ||
- | font-size: 10px; | ||
- | } | ||
- | #mintbl table { | ||
- | font-size: 12px; | ||
- | } | ||
- | #mintbl td pre { | ||
- | margin: 0; | ||
- | } | ||
- | #img_long { | ||
- | height: 400px; | ||
- | overflow: scroll; | ||
- | overflow-x: hidden; | ||
- | } | ||
- | .dokuwiki .plugin_wrap table { | ||
- | width: auto; | ||
- | } | ||
- | #logo { | ||
- | background-color: | ||
- | padding: 10px; | ||
- | width: fit-content; | ||
- | } | ||
- | #logo p { | ||
- | margin: 0; | ||
- | } | ||
- | </ | ||
- | </ | ||
====== Google Colaboratory (略称: Colab) ====== | ====== Google Colaboratory (略称: Colab) ====== | ||
行 120: | 行 84: | ||
NUMA node0 CPU(s): | NUMA node0 CPU(s): | ||
Flags: | Flags: | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | ===== OpenCL ===== | ||
+ | <WRAP prewrap 100%> | ||
+ | <code python> | ||
+ | !clinfo | ||
+ | </ | ||
+ | </ | ||
+ | <WRAP prewrap 100% #result> | ||
+ | <code python> | ||
+ | Number of platforms | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | メニューの [ランタイム] - [ランタイムのタイプを変更] で「ノートブックの設定」の「ハードウェア アクセラレータ」を設定する。\\ | ||
+ | |||
+ | **ハードウェア アクセラレータ: GPU** | ||
+ | <WRAP prewrap 100% # | ||
+ | <code python> | ||
+ | Number of platforms | ||
+ | Platform Name | ||
+ | Platform Vendor | ||
+ | Platform Version | ||
+ | Platform Profile | ||
+ | Platform Extensions | ||
+ | Platform Extensions function suffix | ||
+ | |||
+ | Platform Name | ||
+ | Number of devices | ||
+ | Device Name Tesla T4 | ||
+ | Device Vendor | ||
+ | Device Vendor ID 0x10de | ||
+ | Device Version | ||
+ | Driver Version | ||
+ | Device OpenCL C Version | ||
+ | Device Type GPU | ||
+ | Device Topology (NV) PCI-E, 00:00.4 | ||
+ | Device Profile | ||
+ | Device Available | ||
+ | Compiler Available | ||
+ | Linker Available | ||
+ | Max compute units 40 | ||
+ | Max clock frequency | ||
+ | Compute Capability (NV) 7.5 | ||
+ | Device Partition | ||
+ | Max number of sub-devices | ||
+ | Supported partition types None | ||
+ | Max work item dimensions | ||
+ | Max work item sizes | ||
+ | Max work group size 1024 | ||
+ | Preferred work group size multiple | ||
+ | Warp size (NV) 32 | ||
+ | Preferred / native vector sizes | ||
+ | char 1 / 1 | ||
+ | short 1 / 1 | ||
+ | int 1 / 1 | ||
+ | long 1 / 1 | ||
+ | half 0 / 0 (n/a) | ||
+ | float 1 / 1 | ||
+ | double | ||
+ | Half-precision Floating-point support | ||
+ | Single-precision Floating-point support | ||
+ | Denormals | ||
+ | Infinity and NANs Yes | ||
+ | Round to nearest | ||
+ | Round to zero Yes | ||
+ | Round to infinity | ||
+ | IEEE754-2008 fused multiply-add | ||
+ | Support is emulated in software | ||
+ | Correctly-rounded divide and sqrt operations | ||
+ | Double-precision Floating-point support | ||
+ | Denormals | ||
+ | Infinity and NANs Yes | ||
+ | Round to nearest | ||
+ | Round to zero Yes | ||
+ | Round to infinity | ||
+ | IEEE754-2008 fused multiply-add | ||
+ | Support is emulated in software | ||
+ | Address bits 64, Little-Endian | ||
+ | Global memory size 15812263936 (14.73GiB) | ||
+ | Error Correction support | ||
+ | Max memory allocation | ||
+ | Unified memory for Host and Device | ||
+ | Integrated memory (NV) No | ||
+ | Minimum alignment for any data type 128 bytes | ||
+ | Alignment of base address | ||
+ | Global Memory cache type Read/Write | ||
+ | Global Memory cache size 655360 (640KiB) | ||
+ | Global Memory cache line size 128 bytes | ||
+ | Image support | ||
+ | Max number of samplers per kernel | ||
+ | Max size for 1D images from buffer | ||
+ | Max 1D or 2D image array size 2048 images | ||
+ | Max 2D image size | ||
+ | Max 3D image size | ||
+ | Max number of read image args 256 | ||
+ | Max number of write image args 32 | ||
+ | Local memory type Local | ||
+ | Local memory size 49152 (48KiB) | ||
+ | Registers per block (NV) 65536 | ||
+ | Max number of constant args 9 | ||
+ | Max constant buffer size 65536 (64KiB) | ||
+ | Max size of kernel argument | ||
+ | Queue properties | ||
+ | Out-of-order execution | ||
+ | Profiling | ||
+ | Prefer user sync for interop | ||
+ | Profiling timer resolution | ||
+ | Execution capabilities | ||
+ | Run OpenCL kernels | ||
+ | Run native kernels | ||
+ | Kernel execution timeout (NV) No | ||
+ | Concurrent copy and kernel execution (NV) Yes | ||
+ | Number of async copy engines | ||
+ | printf() buffer size 1048576 (1024KiB) | ||
+ | Built-in kernels | ||
+ | Device Extensions | ||
+ | |||
+ | NULL platform behavior | ||
+ | clGetPlatformInfo(NULL, | ||
+ | clGetDeviceIDs(NULL, | ||
+ | clCreateContext(NULL, | ||
+ | clCreateContext(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | clCreateContextFromType(NULL, | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | ==== PyOpenCL で OpenCL ベンチマーク ==== | ||
+ | [[python: | ||
+ | |||
+ | **PyOpenCL** をインストールする。\\ | ||
+ | <WRAP prewrap 100%> | ||
+ | <code python> | ||
+ | !pip install pyopencl | ||
+ | </ | ||
+ | </ | ||
+ | <WRAP prewrap 100% #result> | ||
+ | <code python> | ||
+ | Collecting pyopencl | ||
+ | Downloading https:// | ||
+ | | ||
+ | Requirement already satisfied: decorator> | ||
+ | Collecting pytools> | ||
+ | Downloading https:// | ||
+ | | ||
+ | Collecting appdirs> | ||
+ | Downloading https:// | ||
+ | Requirement already satisfied: six> | ||
+ | Requirement already satisfied: numpy in / | ||
+ | Building wheels for collected packages: pytools | ||
+ | Building wheel for pytools (setup.py) ... done | ||
+ | Created wheel for pytools: filename=pytools-2020.2-py2.py3-none-any.whl size=62338 sha256=9aa0450004dbf633f7584e5914d50999d697d018a341b86a7c499bc1fbfd5281 | ||
+ | Stored in directory: / | ||
+ | Successfully built pytools | ||
+ | Installing collected packages: appdirs, pytools, pyopencl | ||
+ | Successfully installed appdirs-1.4.4 pyopencl-2020.1 pytools-2020.2 | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | **benchmark-all.py** を保存する。\\ | ||
+ | <WRAP prewrap 100% # | ||
+ | <code python> | ||
+ | %%file benchmark-all.py | ||
+ | # example provided by Roger Pau Monné | ||
+ | |||
+ | import pyopencl as cl | ||
+ | import numpy | ||
+ | import numpy.linalg as la | ||
+ | import datetime | ||
+ | from time import time | ||
+ | |||
+ | a = numpy.random.rand(1000).astype(numpy.float32) | ||
+ | b = numpy.random.rand(1000).astype(numpy.float32) | ||
+ | c_result = numpy.empty_like(a) | ||
+ | |||
+ | # Speed in normal CPU usage | ||
+ | time1 = time() | ||
+ | for i in range(1000): | ||
+ | for j in range(1000): | ||
+ | c_result[i] = a[i] + b[i] | ||
+ | c_result[i] = c_result[i] * (a[i] + b[i]) | ||
+ | c_result[i] = c_result[i] * (a[i] / 2.0) | ||
+ | time2 = time() | ||
+ | print(" | ||
+ | |||
+ | |||
+ | for platform in cl.get_platforms(): | ||
+ | for device in platform.get_devices(): | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | |||
+ | # Simple speed test | ||
+ | ctx = cl.Context([device]) | ||
+ | queue = cl.CommandQueue(ctx, | ||
+ | properties=cl.command_queue_properties.PROFILING_ENABLE) | ||
+ | |||
+ | mf = cl.mem_flags | ||
+ | a_buf = cl.Buffer(ctx, | ||
+ | b_buf = cl.Buffer(ctx, | ||
+ | dest_buf = cl.Buffer(ctx, | ||
+ | |||
+ | prg = cl.Program(ctx, | ||
+ | __kernel void sum(__global const float *a, | ||
+ | __global const float *b, __global float *c) | ||
+ | { | ||
+ | int loop; | ||
+ | int gid = get_global_id(0); | ||
+ | for(loop=0; loop< | ||
+ | { | ||
+ | c[gid] = a[gid] + b[gid]; | ||
+ | c[gid] = c[gid] * (a[gid] + b[gid]); | ||
+ | c[gid] = c[gid] * (a[gid] / 2.0); | ||
+ | } | ||
+ | } | ||
+ | """ | ||
+ | |||
+ | exec_evt = prg.sum(queue, | ||
+ | exec_evt.wait() | ||
+ | elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start) | ||
+ | |||
+ | # | ||
+ | print(" | ||
+ | |||
+ | c = numpy.empty_like(a) | ||
+ | # | ||
+ | cl.enqueue_copy(queue, | ||
+ | error = 0 | ||
+ | for i in range(1000): | ||
+ | if c[i] != c_result[i]: | ||
+ | error = 1 | ||
+ | if error: | ||
+ | print(" | ||
+ | else: | ||
+ | print(" | ||
+ | </ | ||
+ | </ | ||
+ | |||
+ | **benchmark-all.py** を実行する。\\ | ||
+ | <WRAP prewrap 100%> | ||
+ | <code python> | ||
+ | %run benchmark-all.py | ||
+ | </ | ||
+ | </ | ||
+ | <WRAP prewrap 100% #result> | ||
+ | <code python> | ||
+ | Execution time of test without OpenCL: | ||
+ | =============================================================== | ||
+ | Platform name: NVIDIA CUDA | ||
+ | Platform profile: FULL_PROFILE | ||
+ | Platform vendor: NVIDIA Corporation | ||
+ | Platform version: OpenCL 1.2 CUDA 10.1.152 | ||
+ | --------------------------------------------------------------- | ||
+ | Device name: Tesla P4 | ||
+ | Device type: ALL | GPU | ||
+ | Device memory: | ||
+ | Device max clock speed: 1113 MHz | ||
+ | Device compute units: 20 | ||
+ | Execution time of test: 0.0010557440 s | ||
+ | Results OK | ||
</ | </ | ||
</ | </ | ||
===== 現在の GPU の割り当て状況 ===== | ===== 現在の GPU の割り当て状況 ===== | ||
- | <WRAP prewrap 100% #mincode> | + | <WRAP prewrap 100%> |
<code python> | <code python> | ||
!nvidia-smi | !nvidia-smi |