#!/usr/local/bin/perl -w

# MITgcmUV DataSet joining utility.
# Tested with perl 4.0 and newer.
# Tested on Linux 2.0.27/I486, Irix 6.2/{IP22,IP25}
# Zhangfan XING, xing@pacific.jpl.nasa.gov
# Adapted to work with MDS I/O format by adcroft@mit.edu, 5/7/1999
#
# LOGS:
# 980707, version 0.0.1, basically works
# 980721, version 0.2.0, proper handling of data file's header and terminator
#         for diff bytesex.
# 990507, HACK'd by AJA. Needs to be properly merged with the original joinds

#------
# usage
#------
sub usage {
	print STDERR
		"\nUsage:$0 [-Ddir0 -Ddir1 ...] " .
		"prefix suffix [(little-endian|big-endian)]\n"; 
	print STDERR "\nMITgcmUV DataSet joining utility, version 0.3.0\n";
	print STDERR
	"Check http://escher.jpl.nasa.gov:2000/tools/ for newer version.\n";
	print STDERR "Report problem to xing\@pacific.jpl.nasa.gov\n\n";
	exit 1;
}

#------------------------------
# product of a list of integers
#------------------------------
sub listprod {
	local ($product) = 1;
	local ($x);
	foreach $x (@_) {
		$product *= $x;
	}
	$product;
}

#----------------
# @list1 + @list2
#----------------
sub lists_add {
	local (*l1,*l2) = @_;
	($#l1 == $#l2) || return undef;

	local (@l);
	for (local($i)=0;$i<=$#l1;$i++) {
		$l[$i]=$l1[$i]+$l2[$i];
	}
	@l;
}

#-------------
# pos to index
# 0-based.
#-------------
sub pos2index {

	local ($pos,@dim) = @_;
	local ($rightmost) = pop(@dim);

	local (@index,$d);
	foreach $d (@dim) {
		push(@index,$pos%$d);
		$pos = int($pos/$d);
	}

	# self-guarding
	unless ($rightmost > $pos) {
		return undef;
	}

	push(@index,$pos);
	@index;
}

#-------------
# index to pos
# 0-based.
#-------------
sub index2pos {
	local (*index,*dim) = @_;

	return undef unless ($#index == $#dim);

	local ($pos) = $index[$#index];
	for (local($i)=$#dim;$i>0;$i--) {
		$pos = $pos * $dim[$i-1] + $index[$i-1];
	}
	$pos;
}

#-------------------------
# check machine's bytesex.
# returns "little-endian" or "big-endian"
# or dies if unable to figure out
#-------------------------
sub mach_bytesex {

	local ($foo)  = pack("s2",1,2);
	if ($foo eq "\1\0\2\0") {
		return "little-endian";
	} elsif ($foo eq "\0\1\0\2") {
		return "big-endian";
	} else {
		die "Your machine has a strange bytesex.\n".
		"Email your platform info to xing\@pacific.jpl.nasa.gov\n";
	}
}

#--------------------------------------------------
# check bytesex of a fortran unformatted data file
# current machine's bytesex is used as a reference.
# returns: one of "little-endian", "big-endian", "undecidable" and "unknown"
#--------------------------------------------------
sub file_bytesex {

	# only if this platform's bytesex is either big- or little-endian
	# otherwise dies. Hope this won't happen.
	local($mach_bytesex) = &mach_bytesex();

	local ($file) = shift;
	local (*FILE);

	open(FILE,$file) || die "$file: $!\n";

	local(@fstat) = stat(FILE);
	local ($size) = $fstat[7] - 8;	# total data size in bytes

	local($hdr,$tmr) = ("","");
	read(FILE,$hdr,4);
	seek(FILE,-4,2);
	read(FILE,$tmr,4);
	close(FILE);

	# this part checks for self-consistency of Fortran unformatted file
	($hdr eq $tmr) || die "$file: not a Fortran unformatted data file.\n";

	local ($ori) = unpack("I",$hdr);
	local ($rev) = unpack("I",join("",reverse(split(//,$hdr))));

	($ori != $size && $rev != $size) &&
		return "unknown";

	($ori == $size && $rev == $size) &&
		return "undecidable";

	local ($opposite) = ($mach_bytesex eq "little-endian") ?
				"big-endian" : "little-endian";

	return ($ori == $size) ? $mach_bytesex : $opposite;

}

#--------------------------------
# check meta info for one dataset
#--------------------------------

sub check_meta {

	local ($ds,$dir) = @_;
	local ($fmeta) = "$dir/$ds.meta";

	#~~~~~~~~~~~~~~~~
	# check meta info
	#~~~~~~~~~~~~~~~~

	undef $/;		# read to the end of file
	open(MFILE,"<$fmeta") || die "$fmeta: $!\n";
	$_=<MFILE>;
	close(MFILE);
	$/ = "\n";		# never mess up
	
	s/\([^)]*\)//g;         #rm (.*)
	s/\/\/[^\n]*\n//g;      #rm comment lines
	s/\/\*.*\*\///g;        #rm inline comments
	s/\s+//g;               #rm white spaces
	/nDims=\[(.+)\];dimList=\[(.+)\];format=\['(.+)'\];nrecords=\[(.+)\];timeStepNumber=\[(.+)\];/
		|| die "$fmeta: meta file format error\n";
	local ($nDims_,$dimList_,$format_,$nrecords_,$timeStepNumber_) = ($1,$2,$3,$4,$5);

	# check Identifier
	(defined $timeStepNumber) || ($timeStepNumber = $timeStepNumber_);
	($timeStepNumber eq $timeStepNumber_) ||
		die "$fmeta: timeStepNumber $timeStepNumber_ inconsistent with other dataset\n";

	# check Number of dimensions
	(defined $nDims) || ($nDims = $nDims_);
	($nDims eq $nDims_) ||
		die "$fmeta: nDims $nDims_ inconsistent with other dataset\n";

	# check Field format
	(defined $format) || ($format = $format_);
	($format eq $format_) ||
		die "$fmeta: format $format_ inconsistent with other dataset\n";

	# check dimList
	# calc dimesions and leading index of this subset
	local (@dimList_) = split(/,/,$dimList_);

 	($nDims_*3 == $#dimList_+1) ||
		die "$fmeta: nDims and dimList conflicting\n";
	
	local (@Dim,@dim,@Index0) = ();
	for (local($i)=0;$i<$nDims_;$i++) {
		push(@Dim,$dimList_[$i*3]);
		push(@dim,$dimList_[$i*3+2]-$dimList_[$i*3+1]+1);
		push(@Index0,$dimList_[$i*3+1]-1);
	}
	local ($Dim_) = join(",",@Dim);
	local ($dim_) = join(",",@dim);

	(defined $Dim) || ($Dim = $Dim_);
	($Dim eq $Dim_) ||
		die "$fmeta: dimList Global inconsistent with other dataset\n";

	(defined $dim) || ($dim = $dim_);
	($dim eq $dim_) ||
		die "$fmeta: dimList Local inconsistent with other dataset\n";

	$ds_Index0{$ds} = join(",", @Index0);

#	print STDOUT "Okay $fmeta\n";
}

#-------------------------------
# check completeness of datasets
# need to be more sophisticated
#-------------------------------
sub check_entirety {

	local (*Dim,*dim,*ds_Index0) = @_;

	local ($N) = &listprod(@Dim);
	local ($n) = &listprod(@dim);
	($N) || return 0;		# against null dimension
	($n) || return 0;		# against null dimension
	($N%$n) && return 0;		# $N/$n must be a whole number

	local (@ds) = keys %ds_Index0;
	($#ds+1 == $N/$n) || return 0;	# Num of datasets must match subdomain

	1;
}

#------------------
# merge one dataset
# assume @Dim, @dim and $bytes existing
# assume $Byte_Reorder existing
#------------------
sub merge_data {

	local ($ds,$dir,*Index0) = @_;
	local ($fdata) = "$dir/$ds.data";

	# data size of one subset in bytes as told by meta info
	local ($size) = &listprod(@dim) * $bytes;

	open(DFILE, "<$fdata") || die "$fdata: $!\n";

	local ($raw) = "";
#aja	sysread(DFILE,$raw,4);
	# Swap header if bytesex is diff from machine's
	local ($hdr);
	if ($Byte_Reorder) {
		$hdr = unpack("I",join("",reverse(split(//,$raw))));
	} else {
		$hdr = unpack("I",$raw);
	}

#aja	($size == $hdr) ||
#aja		die "$fdata: $hdr bytes inconsistent with meta info\n";

	print STDOUT "$ds.data: $size bytes, okay, ";

#	seek(DFILE,4,0);	# rewind back to the beginning of data

	local ($data) = "";		# old perl (< 4.0) needs this to 
	sysread(DFILE,$data,$size);	# avoid warning by sysread() 
	local ($len_chunk) = $dim[0] * $bytes;
	local ($num_chunk) = $size/$len_chunk;

	local ($pos,@index,$Pos,@Index);
	for (local($i)=0;$i<$num_chunk;$i++) {
		$pos = $i * $dim[0];
		@index = &pos2index($pos,@dim);
		@Index = &lists_add(*index,*Index0);
		$Pos = &index2pos(*Index,*Dim);
#aja		seek(FILE,$Pos*$bytes+4,0);
		seek(FILE,$Pos*$bytes,0);
		syswrite(FILE,$data,$len_chunk,$pos*$bytes);
	}

	close(DFILE);

	print STDOUT "merged from $dir\n";
}

#============
# main script
#============

#------------
# parse @ARGV
#............

($#ARGV >= 1) || &usage();

undef @dirs;
while (1) {
	$x = shift(@ARGV);
	unless ($x =~ /^-D(.+)$/) {
		unshift(@ARGV,$x);
		last;
	}
	push(@dirs,$1);
}
(@dirs) || push(@dirs,".");
# @dirs is not empty after this line.
#print STDOUT join(" ",@dirs), "\n";

($#ARGV >= 1) || &usage();

# data set prefix and suffix
$pref = shift(@ARGV);
$suff = shift(@ARGV);

($#ARGV >= 1) && &usage();
undef $forced_bytesex;
if (@ARGV) {
	$forced_bytesex = shift(@ARGV);
	$forced_bytesex =~ /^(little|big)-endian$/ || &usage();
}
#print STDOUT $forced_bytesex, "\n";

#--------------------------
# obtain a list of datasets
#..........................

# %ds_dir is a hash to store the directory that a dataset is in.
# After this step, it is assured that, for a dataset $ds,
# both $ds.meta and $ds.data exist in a unique dir $ds_dir{$ds}.

%ds_dir = ();
foreach $dir (@dirs) {
	opendir(DIR, $dir) || die "$dir: $!\n";
	@fmeta = grep(/^$pref\.$suff\.\d+\.\d+\.meta$/, readdir(DIR));
	closedir(DIR);
	foreach $fmeta (@fmeta) {
		$ds = $fmeta; $ds =~ s/\.meta$//g;
		(defined $ds_dir{$ds}) &&
			die "$fmeta appears in two dirs: $ds_dir{$ds} & $dir\n";
		(-f "$dir/$ds.data") || die "In $dir, $ds.data missing\n";
		$ds_dir{$ds} = $dir;
	}
}

@ds = sort(keys %ds_dir);	 # list of datasets
(@ds) || die "No dataset found.\n";
print STDOUT "There are ", $#ds+1, " datasets.\n";

#---------------------------------
# check meta info for all datasets
#.................................

undef $timeStepNumber;
undef $nDims;
undef $format;

undef $Dim;
undef $dim;
undef %ds_Index0;

#..............................................
# check each meta file and set some global vars

foreach $ds (@ds) {
	&check_meta($ds,$ds_dir{$ds});
}
print STDOUT "All existing meta files are self- and mutually consistent.\n";

#print join(" ",$timeStepNumber,$nDims,$format,$Dim,$dim), "\n";
#foreach $ds (@ds) {
#	$dir = $ds_dir{$ds};
#	$Index0 = $ds_Index0{$ds};
#	print "$ds\n";
#	print "$Index0\n";
#}

@Dim = split(/,/,$Dim);
@dim = split(/,/,$dim);

#................................
# check meta info in its entirety

&check_entirety(*Dim,*dim,*ds_Index0) ||
	die "Datasets are not complete!\n";

print STDOUT "Datasets are complete.\n";

#...........
# set $bytes

if ($format eq "float32") {
	$bytes = 4;
} elsif ($format eq "float64") {
	$bytes = 8
} else {
	die "format '$format' unknown\n";
}

#---------------------------
# check and merge data files
#...........................

#........................
# check machine's bytesex
# it dies if neither little- nor big-endian.

$Mach_Bytesex = &mach_bytesex();
print STDOUT "Current machine's endianness: $Mach_Bytesex\n";

#...................
# check file bytesex and resolve related issues
#aja undef $File_Bytesex;
#aja foreach $ds (@ds) {
#aja 	$fdata = "$ds.data";
#aja 	$file_bytesex = &file_bytesex($ds_dir{$ds}."/$fdata");
#aja 	($file_bytesex eq "unknown") &&
#aja 		die "$fdata: endianness is neither little- nor big-endian.\n";
#aja 	print STDOUT "$fdata: $file_bytesex\n";
#aja 	unless ($File_Bytesex) {
#aja 		$File_Bytesex = $file_bytesex;
#aja 	} else {
#aja 		($File_Bytesex eq $file_bytesex) ||
#aja 		die "Data files are mutually inconsistent in endianness\n";
#aja 	}
#aja }
$File_Bytesex = 'big-endian';

#------------------
# set $Byte_Reorder, which controls swapping of bytes in
# header and terminator of Fortran unformatted data files.
#aja $Byte_Reorder = 0;
$Byte_Reorder = 0;

# if machine and data file have the same bytesex, no need for swapping
#aja ($File_Bytesex eq $Mach_Bytesex) && ($Byte_Reorder = 0);

# if we can't determine bytesex of data file, need forced one from @ARGV.
if ($File_Bytesex eq "undecidable") {
	# if no forced bytesex available, dies.
	($forced_bytesex) ||
		die "Endianness of data files is undecidable, " .
		"you have to give one at command line.\n";
	($forced_bytesex eq $Mach_Bytesex) && ($Byte_Reorder = 0);
	print STDOUT "Endianness of data files is undecidable.\n";
	print STDOUT "Data file header/tail will be treated as ";
	print STDOUT "$forced_bytesex as you have instructed.\n";
# otherwise
} else {
# give a warining, if swapping is needed.
($Byte_Reorder) &&
	print STDOUT
	"Please note: data files have different bytesex than machine!\n";
}

#................
# merge data sets

$Size = &listprod(@Dim) * $bytes;

$fout = "$pref.$suff.data";

open(FILE, ">$fout") || die "$fout: $!\n";

# prepare header and teminator. Do byte reordering if necessary
$HdrTmr = pack("I",$Size);
($Byte_Reorder) && ($HdrTmr = join("",reverse(split(//,$HdrTmr))));

# write 4 byte header
#aja syswrite(FILE,$HdrTmr,4);

# merge each dataset
foreach $ds (@ds) {
	$dir = $ds_dir{$ds};
	@Index0 = split(/,/,$ds_Index0{$ds});
	&merge_data($ds,$dir,*Index0);
}

# write 4 byte terminator
#aja seek(FILE,$Size+4,0);
#aja syswrite(FILE,$HdrTmr,4);

close(FILE);

print STDOUT "Global data (" .
	join("x",@Dim) .
	") is in ./$fout (endianness is $File_Bytesex).\n";

exit 0;
