Hi,
Is there a way to skip checksum while loading a file using Hadoop's C API?
I have a sample program that just counts the number of spaces in an HDFS
file:
int main(int argc, char *argv[]) {
long count = 0;
long buflen = 2147483645; // Integer.MAX_VALUE - 2
char path[35] = "hdfs://127.0.0.1:9000";
strcat(path, argv[1]);
hdfsFS x2 = hdfsConnect("127.0.0.1", 9000);
hdfsFile x3 = hdfsOpenFile(x2, path, 0, 0, 0, 0);
hdfsFileInfo *info;
info = hdfsGetPathInfo(x2, path);
tOffset length = info->mSize;
char *buffer = calloc(buflen, sizeof(char));
long i = 0;
while (i < length) {
long diff = length - i;
int toread = ((long)buflen) > diff ? (int)diff : buflen;
hdfsPread(x2, x3, i, buffer, toread);
for (int j = 0; j<toread; j++) {
if (buffer[j] != ' ') {
count++;
}
}
i += toread;
}
printf("%ld\n", count);
free(buffer);
hdfsFreeFileInfo(info, 1);
hdfsCloseFile(x2, x3);
return 1;
}
Is there a way to skip checksum for the above operation over an HDFS file?
I know that there is a way to skip checksum using
hadoopRzOptionsSetSkipChecksum(), but that uses different API calls to load
the file.
Thanks,
--
Pratyush Das