#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define GPULOCAL 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define MAXN 2048
static int N, M, BN;
static char str[MAXN][MAXN] = {};
static char hostMtx[2][MAXN*MAXN] = {};
__global__ void simulate(char IN[], char OUT[], int N, int BN) {
int globalId = blockIdx.x * blockDim.x + threadIdx.x;
int x = globalId/N + 1, y = globalId%N + 1;
#define G(x, y) IN[(x) * BN + (y)]
char t = G(x, y);
char adj = G(x-1, y-1) + G(x-1, y) + G(x-1, y+1)
+ G(x, y-1) + G(x, y+1) + G(x+1, y-1) + G(x+1, y) + G(x+1, y+1);
OUT[x * BN + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));
#undef G
}
void runCuda() {
int cudaDeviceCnt = 0;
cudaGetDeviceCount(&cudaDeviceCnt);
if (cudaDeviceCnt == 0) {
printf("No supported GPU\n");
return ;
}
char *cuMtx[2];
int memSz = BN*BN*sizeof(char);
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N%i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
cudaMalloc((void **) &cuMtx[0], memSz);
cudaMalloc((void **) &cuMtx[1], memSz);
cudaMemcpy(cuMtx[0], hostMtx[0], memSz, cudaMemcpyHostToDevice);
cudaMemcpy(cuMtx[1], hostMtx[1], memSz, cudaMemcpyHostToDevice);
for (int i = 0; i < M; i++) {
simulate<<<cuGrid, cuBlock>>>(cuMtx[i%2], cuMtx[(i+1)%2], N, BN);
CheckErr(cudaGetLastError());
}
cudaDeviceSynchronize();
int f = M%2;
cudaMemcpy(hostMtx[f], cuMtx[f], memSz, cudaMemcpyDeviceToHost);
for (int i = 1; i <= N; i++) {
for (int j = 1; j <= N; j++)
hostMtx[f][i*BN + j] += '0';
puts(hostMtx[f] + i*BN + 1);
}
}
int main() {
assert(scanf("%d %d", &N, &M) == 2);
while (getchar() != '\n');
for (int i = 1; i <= N; i++)
assert(fgets(str[i]+1, MAXN, stdin) != NULL);
BN = N+2;
for (int i = 1; i <= N; i++) {
for (int j = 1; j <= N; j++)
hostMtx[0][i*BN + j] = str[i][j] - '0';
}
runCuda();
return 0;
}