2009/9/12 Alan Woodland <alan.woodl...@gmail.com>:
> I'll try and convert what I've got into a repeatable, automated test
> and then add it to this bug report in a bit too.
As promised I've attached a somewhat crude test for the BLCR
checkpointing to this email. run_test.sh compiles and runs everything.

Alan
#define _GNU_SOURCE
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>

/*
 * Test harness for MPI BLCR checkpointing. Depends upon mpi_test_blcr.c,
 * a modified version of ring_c.c that ships with the OpenMPI examples.
 * (C) 2009 Alan Woodland <awoodl...@debian.org>,  under the same terms 
 * as OpenMPI.
 */

/* Report a failed library/system call via perror() and abort the harness. */
static void die(const char *what)
{
  perror(what);
  exit(-1);
}

/*
 * Verify the records currently readable from one rank's log.
 *
 * fd               descriptor for "<rank>_run.log" opened at start-up; each
 *                  call continues reading where the previous call stopped.
 * expected_rank    rank that every "rank:n" record must carry.
 * must_start_at_10 when non-zero, the first record must have n == 10.
 *
 * Every record must have n exactly one less than the record before it.
 * Returns the n of the last record, or -1 if no record was available.
 */
static int check_rank_log(int fd, int expected_rank, int must_start_at_10)
{
  char buf[2048];
  const ssize_t got = read(fd, buf, sizeof buf);
  if (got < 0) {
    die("read");
  }

  int lastn = -1;
  if (0 == got) {
    // Nothing to parse; fmemopen() may reject a zero-length buffer.
    return lastn;
  }

  FILE *stream = fmemopen(buf, (size_t)got, "r");
  if (!stream) {
    die("fmemopen");
  }

  int r, n, fields;
  // Only look at r and n when fscanf() actually converted both of them.
  while ((fields = fscanf(stream, "%d:%d", &r, &n)) == 2) {
    assert(r == expected_rank);
    assert(n == lastn - 1 || lastn < 0);
    if (must_start_at_10) {
      assert(lastn >= 0 || n == 10);
    }
    lastn = n;
  }
  // Anything other than a clean end of input is a malformed record.
  assert(fields == EOF);
  fclose(stream);
  return lastn;
}

/*
 * Test driver: starts the MPI ring program under mpirun, waits until rank 0
 * has logged a counter of 6, checkpoints the job with ompi-checkpoint, kills
 * it, verifies the per-rank logs, then restarts the snapshot with
 * ompi-restart and verifies that the logs run down to the end.
 *
 * RANK (the number of MPI processes) is not defined in this file; it is
 * expected to be supplied at compile time, e.g. -DRANK=N (presumably by
 * run_test.sh -- confirm).
 *
 * Returns 0 on success; exits with -1 or aborts via assert() on failure.
 */
int main(void) {
  int pipes[2];
  if (pipe(pipes)) {
    die("pipe");
  }

  int fds[RANK];
  for (int i = 0; i < RANK; ++i) {
    char fname[50];
    snprintf(fname, sizeof fname, "%d_run.log", i);
    // Create it if it doesn't exist yet anyway, truncate it if it does.
    // O_CREAT requires the mode argument; open() reports failure as -1.
    fds[i] = open(fname, O_CLOEXEC|O_CREAT|O_TRUNC|O_RDWR, 0644);
    if (fds[i] < 0) {
      die("open");
    }
  }

  pid_t child = fork();
  if (-1 == child) {
    die("fork");
  }

  if (0 == child) {
    // child: becomes mpirun, with the pipe's read end as its stdin
    char rank[16];
    snprintf(rank, sizeof rank, "%d", RANK);
    close(pipes[1]); // close the write end
    if (-1 == dup2(pipes[0], STDIN_FILENO)) {
      die("dup2");
    }
    close(pipes[0]);
    // Debian-ism!  The variadic sentinel must be a null *pointer*.
    execlp("mpirun.openmpi", "mpirun", "-np", rank, "-am", "ft-enable-cr",
           "./a.out", (char *)NULL);
    die("execlp"); // only reached if exec failed
  }

  // parent
  close(pipes[0]); // close the read end
  FILE *master = fopen("0_run.log", "r");
  if (!master) {
    die("fopen");
  }

  int counter = 10;
  while (counter != 6) {
    int r, n;
    const int got = fscanf(master, "%d:%d", &r, &n);
    if (EOF == got) {
      if (ferror(master)) {
        die("fscanf");
      }
      // No new record yet.  Give up if mpirun is already gone, otherwise
      // clear the sticky EOF flag and poll again after a short pause.
      int status;
      if (waitpid(child, &status, WNOHANG) == child) {
        fprintf(stderr, "mpirun exited before the counter reached 6\n");
        exit(-1);
      }
      clearerr(master);
      usleep(10000);
      continue;
    }
    if (got != 2) {
      fprintf(stderr, "Malformed record in 0_run.log\n");
      exit(-1);
    }
    assert(r == 0);
    assert(n == counter - 1 || (n == counter && n == 10));
    counter = n;
    fprintf(stderr, "Counter is now: %d\n", counter);
  }
  fclose(master);
  fprintf(stderr, "Counter=6, doing checkpoint\n");

  char cmd[2048];
  char snapshotid[2048] = "";
  snprintf(cmd, sizeof cmd, "ompi-checkpoint %ld", (long)child);
  FILE *chkpt = popen(cmd, "r");
  if (!chkpt) {
    die("popen");
  }
  // Feed rank 0 the newline it is waiting for on stdin.
  if (write(pipes[1], "\n", 1) != 1) {
    die("write");
  }
  // Keep the last whitespace-delimited token of the output; the field
  // width keeps fscanf() inside snapshotid.
  while (fscanf(chkpt, "%2047s", snapshotid) == 1) {
  }
  int result = pclose(chkpt);
  assert(!result);
  if ('\0' == snapshotid[0]) {
    fprintf(stderr, "ompi-checkpoint printed no snapshot id\n");
    exit(-1);
  }

  fprintf(stderr, "Snapshot done: %s\n", snapshotid);
  if (kill(child, SIGINT)) {
    die("kill");
  }
  // Reap mpirun so it is neither left as a zombie nor still running when
  // the logs are inspected and the snapshot is restarted.
  while (waitpid(child, NULL, 0) < 0) {
    if (errno != EINTR) {
      die("waitpid");
    }
  }

  // check everyone hit 6 ok:
  for (int i = 0; i < RANK; ++i) {
    const int lastn = check_rank_log(fds[i], i, 1);
    assert(lastn <= 6);
    fprintf(stderr, "Log from: %d passed stage1! (Final=%d)\n", i, lastn);
  }

  // Restart it and check everything hits 0
  const int len = snprintf(cmd, sizeof cmd, "ompi-restart %s", snapshotid);
  if (len < 0 || (size_t)len >= sizeof cmd) {
    fprintf(stderr, "Snapshot id too long\n");
    exit(-1);
  }
  FILE *restarted = popen(cmd, "w");
  if (!restarted) {
    die("popen");
  }
  result = pclose(restarted);
  assert(!result);

  fprintf(stderr, "Restart done\n");
  // check everyone hit 0 ok:
  for (int i = 0; i < RANK; ++i) {
    const int lastn = check_rank_log(fds[i], i, 0);
    assert(lastn == 0 || (!i && lastn == 1));
    fprintf(stderr, "Log from: %d passed stage2! (Final=%d)\n", i, lastn);
  }

  for (int i = 0; i < RANK; ++i) {
    close(fds[i]);
  }
  close(pipes[1]);

  return 0;
}
/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 *
 * Simple ring test program
 *
 * Modifications for checkpoint testing added 2009, awoodl...@debian.org
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

/* Open "<rank>_run.log" for writing; on failure report it and exit(-1). */
static FILE *open_rank_log(int rank)
{
    char path[50];
    snprintf(path, sizeof path, "%d_run.log", rank);

    FILE *fp = fopen(path, "w");
    if (NULL == fp) {
        perror("fopen");
        exit(-1);
    }
    return fp;
}

/*
 * Ring test with a checkpoint pause.
 *
 * Rank 0 injects the value 10 into the ring.  Every process then repeatedly
 * receives the value from its predecessor, sleeps 100 ms, appends
 * "rank:value" to its own log (flushed immediately) and forwards the value
 * to its successor.  Rank 0 decrements the value after logging it, and when
 * the decremented value is 5 it blocks on stdin until a character arrives
 * (exiting with -1 on EOF) so that a checkpoint can be taken.  A process
 * leaves the loop once it has forwarded a 0.
 */
int main(int argc, char *argv[])
{
    const int tag = 201;
    int rank, size, message;

    /* Bring MPI up and find our place in the world. */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Ring neighbours; the modulus makes the ends wrap around. */
    const int next = (rank + 1) % size;
    const int prev = (rank + size - 1) % size;
    const int is_master = (0 == rank);

    FILE *logfile = open_rank_log(rank);

    /* The master seeds the ring with the number of laps to run. */
    if (is_master) {
        message = 10;

        printf("Process 0 sending %d to %d, tag %d (%d processes in ring)\n",
               message, next, tag, size);
        MPI_Send(&message, 1, MPI_INT, next, tag, MPI_COMM_WORLD);
        printf("Process 0 sent to %d\n", next);
    }

    /* Circulate the value until a 0 has been passed on.  Forwarding the 0
       before leaving lets every process see it and stop as well. */
    do {
        MPI_Recv(&message, 1, MPI_INT, prev, tag, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);

        usleep(100000);

        /* Record the value as received and push it to disk right away. */
        fprintf(logfile, "%d:%d\n", rank, message);
        fflush(logfile);

        if (is_master) {
            --message;
            printf("Process 0 decremented value: %d\n", message);

            /* Pause at 5 so a checkpoint can be taken; stdin EOF is fatal.
               (A second pause at 4 existed once but is disabled.) */
            if (5 == message && EOF == getchar()) {
                exit(-1);
            }
        }

        MPI_Send(&message, 1, MPI_INT, next, tag, MPI_COMM_WORLD);
    } while (0 != message);

    printf("Process %d exiting\n", rank);

    /* The last process sends once more to rank 0; drain that message
       before shutting down. */
    if (is_master) {
        MPI_Recv(&message, 1, MPI_INT, prev, tag, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
    }

    MPI_Finalize();

    fclose(logfile);
    return 0;
}

Attachment: run_test.sh
Description: Bourne shell script

Reply via email to