差分
このページの2つのバージョン間の差分を表示します。
| 次のリビジョン | 前のリビジョン | ||
| web:google:colaboratory [2020/06/10 07:21] – 作成 ともやん | web:google:colaboratory [2020/09/11 16:46] (現在) – ともやん | ||
|---|---|---|---|
| 行 1: | 行 1: | ||
| - | < | ||
| - | < | ||
| - | #result pre, #mincode pre { | ||
| - | overflow: hidden; | ||
| - | font-size: 10px; | ||
| - | } | ||
| - | # | ||
| - | height: 250px; | ||
| - | overflow: scroll; | ||
| - | overflow-x: hidden; | ||
| - | font-size: 10px; | ||
| - | } | ||
| - | #mintbl table { | ||
| - | font-size: 12px; | ||
| - | } | ||
| - | #mintbl td pre { | ||
| - | margin: 0; | ||
| - | } | ||
| - | #img_long { | ||
| - | height: 400px; | ||
| - | overflow: scroll; | ||
| - | overflow-x: hidden; | ||
| - | } | ||
| - | .dokuwiki .plugin_wrap table { | ||
| - | width: auto; | ||
| - | } | ||
| - | #logo { | ||
| - | background-color: | ||
| - | padding: 10px; | ||
| - | width: fit-content; | ||
| - | } | ||
| - | #logo p { | ||
| - | margin: 0; | ||
| - | } | ||
| - | </ | ||
| - | </ | ||
| ====== Google Colaboratory (略称: Colab) ====== | ====== Google Colaboratory (略称: Colab) ====== | ||
| 行 120: | 行 84: | ||
| NUMA node0 CPU(s): | NUMA node0 CPU(s): | ||
| Flags: | Flags: | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | ===== OpenCL ===== | ||
| + | <WRAP prewrap 100%> | ||
| + | <code python> | ||
| + | !clinfo | ||
| + | </ | ||
| + | </ | ||
| + | <WRAP prewrap 100% #result> | ||
| + | <code python> | ||
| + | Number of platforms | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | メニューの [ランタイム] - [ランタイムのタイプを変更] で「ノートブックの設定」の「ハードウェア アクセラレータ」を設定する。\\ | ||
| + | |||
| + | **ハードウェア アクセラレータ: | ||
| + | <WRAP prewrap 100% # | ||
| + | <code python> | ||
| + | Number of platforms | ||
| + | Platform Name | ||
| + | Platform Vendor | ||
| + | Platform Version | ||
| + | Platform Profile | ||
| + | Platform Extensions | ||
| + | Platform Extensions function suffix | ||
| + | |||
| + | Platform Name | ||
| + | Number of devices | ||
| + | Device Name Tesla T4 | ||
| + | Device Vendor | ||
| + | Device Vendor ID 0x10de | ||
| + | Device Version | ||
| + | Driver Version | ||
| + | Device OpenCL C Version | ||
| + | Device Type GPU | ||
| + | Device Topology (NV) PCI-E, 00:00.4 | ||
| + | Device Profile | ||
| + | Device Available | ||
| + | Compiler Available | ||
| + | Linker Available | ||
| + | Max compute units 40 | ||
| + | Max clock frequency | ||
| + | Compute Capability (NV) 7.5 | ||
| + | Device Partition | ||
| + | Max number of sub-devices | ||
| + | Supported partition types None | ||
| + | Max work item dimensions | ||
| + | Max work item sizes | ||
| + | Max work group size 1024 | ||
| + | Preferred work group size multiple | ||
| + | Warp size (NV) 32 | ||
| + | Preferred / native vector sizes | ||
| + | char 1 / 1 | ||
| + | short 1 / 1 | ||
| + | int 1 / 1 | ||
| + | long 1 / 1 | ||
| + | half 0 / 0 (n/a) | ||
| + | float 1 / 1 | ||
| + | double | ||
| + | Half-precision Floating-point support | ||
| + | Single-precision Floating-point support | ||
| + | Denormals | ||
| + | Infinity and NANs Yes | ||
| + | Round to nearest | ||
| + | Round to zero Yes | ||
| + | Round to infinity | ||
| + | IEEE754-2008 fused multiply-add | ||
| + | Support is emulated in software | ||
| + | Correctly-rounded divide and sqrt operations | ||
| + | Double-precision Floating-point support | ||
| + | Denormals | ||
| + | Infinity and NANs Yes | ||
| + | Round to nearest | ||
| + | Round to zero Yes | ||
| + | Round to infinity | ||
| + | IEEE754-2008 fused multiply-add | ||
| + | Support is emulated in software | ||
| + | Address bits 64, Little-Endian | ||
| + | Global memory size 15812263936 (14.73GiB) | ||
| + | Error Correction support | ||
| + | Max memory allocation | ||
| + | Unified memory for Host and Device | ||
| + | Integrated memory (NV) No | ||
| + | Minimum alignment for any data type 128 bytes | ||
| + | Alignment of base address | ||
| + | Global Memory cache type Read/Write | ||
| + | Global Memory cache size 655360 (640KiB) | ||
| + | Global Memory cache line size 128 bytes | ||
| + | Image support | ||
| + | Max number of samplers per kernel | ||
| + | Max size for 1D images from buffer | ||
| + | Max 1D or 2D image array size 2048 images | ||
| + | Max 2D image size | ||
| + | Max 3D image size | ||
| + | Max number of read image args 256 | ||
| + | Max number of write image args 32 | ||
| + | Local memory type Local | ||
| + | Local memory size 49152 (48KiB) | ||
| + | Registers per block (NV) 65536 | ||
| + | Max number of constant args 9 | ||
| + | Max constant buffer size 65536 (64KiB) | ||
| + | Max size of kernel argument | ||
| + | Queue properties | ||
| + | Out-of-order execution | ||
| + | Profiling | ||
| + | Prefer user sync for interop | ||
| + | Profiling timer resolution | ||
| + | Execution capabilities | ||
| + | Run OpenCL kernels | ||
| + | Run native kernels | ||
| + | Kernel execution timeout (NV) No | ||
| + | Concurrent copy and kernel execution (NV) Yes | ||
| + | Number of async copy engines | ||
| + | printf() buffer size 1048576 (1024KiB) | ||
| + | Built-in kernels | ||
| + | Device Extensions | ||
| + | |||
| + | NULL platform behavior | ||
| + | clGetPlatformInfo(NULL, | ||
| + | clGetDeviceIDs(NULL, | ||
| + | clCreateContext(NULL, | ||
| + | clCreateContext(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | clCreateContextFromType(NULL, | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | ==== PyOpenCL で OpenCL ベンチマーク ==== | ||
| + | [[python: | ||
| + | |||
| + | **PyOpenCL** をインストールする。\\ | ||
| + | <WRAP prewrap 100%> | ||
| + | <code python> | ||
| + | !pip install pyopencl | ||
| + | </ | ||
| + | </ | ||
| + | <WRAP prewrap 100% #result> | ||
| + | <code python> | ||
| + | Collecting pyopencl | ||
| + | Downloading https:// | ||
| + |  | ||
| + | Requirement already satisfied: decorator> | ||
| + | Collecting pytools> | ||
| + | Downloading https:// | ||
| + |  | ||
| + | Collecting appdirs> | ||
| + | Downloading https:// | ||
| + | Requirement already satisfied: six> | ||
| + | Requirement already satisfied: numpy in / | ||
| + | Building wheels for collected packages: pytools | ||
| + | Building wheel for pytools (setup.py) ... done | ||
| + | Created wheel for pytools: filename=pytools-2020.2-py2.py3-none-any.whl size=62338 sha256=9aa0450004dbf633f7584e5914d50999d697d018a341b86a7c499bc1fbfd5281 | ||
| + | Stored in directory: / | ||
| + | Successfully built pytools | ||
| + | Installing collected packages: appdirs, pytools, pyopencl | ||
| + | Successfully installed appdirs-1.4.4 pyopencl-2020.1 pytools-2020.2 | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | **benchmark-all.py** を保存する。\\ | ||
| + | <WRAP prewrap 100% # | ||
| + | <code python> | ||
| + | %%file benchmark-all.py | ||
| + | # example provided by Roger Pau Monn'e | ||
| + | |||
| + | import pyopencl as cl | ||
| + | import numpy | ||
| + | import numpy.linalg as la | ||
| + | import datetime | ||
| + | from time import time | ||
| + | |||
| + | a = numpy.random.rand(1000).astype(numpy.float32) | ||
| + | b = numpy.random.rand(1000).astype(numpy.float32) | ||
| + | c_result = numpy.empty_like(a) | ||
| + | |||
| + | # Speed in normal CPU usage | ||
| + | time1 = time() | ||
| + | for i in range(1000): | ||
| + | for j in range(1000): | ||
| + | c_result[i] = a[i] + b[i] | ||
| + | c_result[i] = c_result[i] * (a[i] + b[i]) | ||
| + | c_result[i] = c_result[i] * (a[i] / 2.0) | ||
| + | time2 = time() | ||
| + | print(" | ||
| + | |||
| + | |||
| + | for platform in cl.get_platforms(): | ||
| + | for device in platform.get_devices(): | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | print(" | ||
| + | |||
| + | # Simple speed test | ||
| + | ctx = cl.Context([device]) | ||
| + | queue = cl.CommandQueue(ctx, | ||
| + | properties=cl.command_queue_properties.PROFILING_ENABLE) | ||
| + | |||
| + | mf = cl.mem_flags | ||
| + | a_buf = cl.Buffer(ctx, | ||
| + | b_buf = cl.Buffer(ctx, | ||
| + | dest_buf = cl.Buffer(ctx, | ||
| + | |||
| + | prg = cl.Program(ctx, | ||
| + | __kernel void sum(__global const float *a, | ||
| + | __global const float *b, __global float *c) | ||
| + | { | ||
| + | int loop; | ||
| + | int gid = get_global_id(0); | ||
| + | for(loop=0; loop< | ||
| + | { | ||
| + | c[gid] = a[gid] + b[gid]; | ||
| + | c[gid] = c[gid] * (a[gid] + b[gid]); | ||
| + | c[gid] = c[gid] * (a[gid] / 2.0); | ||
| + | } | ||
| + | } | ||
| + | """ | ||
| + | |||
| + | exec_evt = prg.sum(queue, | ||
| + | exec_evt.wait() | ||
| + | elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start) | ||
| + | |||
| + | # | ||
| + | print(" | ||
| + | |||
| + | c = numpy.empty_like(a) | ||
| + | # | ||
| + | cl.enqueue_copy(queue, | ||
| + | error = 0 | ||
| + | for i in range(1000): | ||
| + | if c[i] != c_result[i]: | ||
| + | error = 1 | ||
| + | if error: | ||
| + | print(" | ||
| + | else: | ||
| + | print(" | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | **benchmark-all.py** を実行する。\\ | ||
| + | <WRAP prewrap 100%> | ||
| + | <code python> | ||
| + | %run benchmark-all.py | ||
| + | </ | ||
| + | </ | ||
| + | <WRAP prewrap 100% #result> | ||
| + | <code python> | ||
| + | Execution time of test without OpenCL: | ||
| + | =============================================================== | ||
| + | Platform name: NVIDIA CUDA | ||
| + | Platform profile: FULL_PROFILE | ||
| + | Platform vendor: NVIDIA Corporation | ||
| + | Platform version: OpenCL 1.2 CUDA 10.1.152 | ||
| + | --------------------------------------------------------------- | ||
| + | Device name: Tesla P4 | ||
| + | Device type: ALL | GPU | ||
| + | Device memory: | ||
| + | Device max clock speed: 1113 MHz | ||
| + | Device compute units: 20 | ||
| + | Execution time of test: 0.0010557440 s | ||
| + | Results OK | ||
| </ | </ | ||
| </ | </ | ||
| ===== 現在の GPU の割り当て状況 ===== | ===== 現在の GPU の割り当て状況 ===== | ||
| - | <WRAP prewrap 100% #mincode> | + | <WRAP prewrap 100%> | 
| <code python> | <code python> | ||
| !nvidia-smi | !nvidia-smi | ||
