#ifndef __MATRIX_H__
#define __MATRIX_H__

#include "Matrix.hxx"

#include <exception>
#include <new>
#include <cstring>
#include <algorithm> //for min<T>(), max<T>(), swap<T>()
using namespace std;

#ifdef USE_ALTIVEC
#include "Altivec.h"
#endif

template<typename T>
inline void Matrix<T>::memorySet(T* data, size_t size, const T& fill)
{
  if (binaryMode && (sizeof(T)<=2*sizeof(char)))
  {
    if (sizeof(T)<=sizeof(char))
      memset(data, *((char*) &fill), size*sizeof(T));
    else if (sizeof(T) == 2*sizeof(char))
    {
      char tab[2] = {0};
      T* pTab = (T*)(&tab[0]);
      *pTab = fill;
      if (tab[0] == tab[1])
        memset(data, tab[0], size*sizeof(T));
      else
      {
        for(size_t i=0 ; i<size ; ++i)
          *data++ = fill;
      }
    }
  }//end if binary
  else
  {
    for(size_t i=0 ; i<size ; ++i)
      *data++ = fill;
  }
}
//end memorySet()

template<typename T>
inline void Matrix<T>::memoryCopy(T* dest, const T* src, size_t size)
{
  if (binaryMode)
    memcpy(dest,src,size*sizeof(T));
  else
  {
    for(size_t i=0 ; i<size ; ++i)
      *dest++ = *src++;
  }
}
//end memoryCopy()

template<typename T>
inline void Matrix<T>::memoryMove(T* dest, const T* src, size_t size)
{
  if (binaryMode)
    memmove(dest,src,size*sizeof(T));
  else if (size)
  {
    if (dest < src)
    {
      for(size_t i=0 ; i<size ; ++i)
        *dest++ = *src++;
    }
    else if (dest > src)
    {
      const T* endSrc  = src+size-1;
      T*       endDest = dest+size-1;
      for(size_t i=0 ; i<size ; ++i)
        *endDest-- = *endSrc--;
    }
  }//end if binary
}
//end memoryMove()

template<typename T>
inline T** Matrix<T>::allocation(size_t height, size_t width, const T& fill)
{
  //allocation
  T** newData = 0;
  if (height && width)
  {
    newData = new T*[height];
    newData[0]  = new(nothrow) T[height*width];
    if (newData[0])
    {
      //fills
      memorySet(newData[0],height*width,fill);

      //links rows to data
      for(size_t i=1,col=width ; i<height ; ++i, col+=width)
        newData[i] = &newData[0][col];
    }//end if alloc ok
    else
    {
      delete [] newData;
      newData = 0;
      throw bad_alloc();
    }
  }//end if (height && width)
  return newData;
}
//end allocation(height,width)

template<typename T>
inline void Matrix<T>::clean(void)
{
  height = 0;
  width  = 0;
  if (data)
  {
    if (data[0]) delete [] data[0];
    delete [] data;
    data = 0;
  }
}
//end clean()

template<typename T>
inline void Matrix<T>::copy(const Matrix<T>& other)
{
  height = other.height;
  width  = other.width;
  if (data && other.data) memoryCopy(data[0],other.data[0],height*width);
  binaryMode = other.binaryMode;
}
//end copy<T>()

template<typename T> template<typename U>
inline void Matrix<T>::copy(const Matrix<U>& other)
{
  height = other.getHeight();
  width  = other.getWidth();
  if (data && other.getData())
  {
    T* pDataDest      = data[0];
    const U* pDataSrc = other.getData()[0];
    for(size_t i=0,size=height*width; i<size ; ++i)
      *pDataDest++ = static_cast<T>(*pDataSrc++);
  }
  binaryMode = other.getBinaryMode();
}
//end copy<U>()

template<typename T>
inline void Matrix<T>::copy(const Matrix<T>& other,size_t fromRow, size_t fromCol,
		                            size_t aHeight, size_t aWidth,
					    bool loop)
{
  height = aHeight;
  width  = aWidth;
  if (data && other.data)
  {
    const size_t nbRowsInsideMatrix  = (other.height>fromRow) ?
		                                           std::min(aHeight,other.height-fromRow) : 0;
    const size_t nbRowsOutsideMatrix = (fromRow+aHeight<other.height) ? 0 
	                                     : fromRow+aHeight-other.height;
    const size_t nbColsInsideMatrix  = (other.width>fromCol) ?
		                                           std::min(aWidth,other.width-fromCol) : 0;
    const size_t nbColsOutsideMatrix = (fromCol+aWidth<other.width) ? 0 
	                                     : fromCol+aWidth-other.width;
    if (!loop || !nbColsOutsideMatrix)
    {
      for(size_t i=0 ; i<nbRowsInsideMatrix ; ++i)
        memoryCopy(data[i],other.data[fromRow+i]+fromCol,nbColsInsideMatrix);
      if (loop && nbRowsOutsideMatrix)
      {
        for(size_t i=0 ; i<nbRowsOutsideMatrix ; ++i)
          memoryCopy(data[nbRowsInsideMatrix+i],
		     other.data[(fromRow+i)%other.height]+fromCol,nbColsInsideMatrix);
      }//end if there is something to do
    }//end if (!loop || !nbColsOutsideMatrix)
    else if (loop && nbColsOutsideMatrix)
    {
      for(size_t i=0 ; i<nbRowsInsideMatrix ; ++i)
      {
        memoryCopy(data[i],other.data[fromRow+i]+fromCol,nbColsInsideMatrix);
	const size_t nbFullWidth = nbColsOutsideMatrix/other.width;
	const size_t nbPartWidth = nbColsOutsideMatrix%other.width;
	for(size_t j=0 ; j<nbFullWidth ; ++j)
          memoryCopy(data[i]+nbColsInsideMatrix+j*other.width,other.data[fromRow+i],other.width);
	if (nbPartWidth)
          memoryCopy(data[i]+nbColsInsideMatrix+nbFullWidth*other.width,
		 other.data[fromRow+i],nbPartWidth);
      }//end rowswithin
      if (loop && nbRowsOutsideMatrix)
      {
        for(size_t i=0 ; i<nbRowsOutsideMatrix ; ++i)
	{
          memoryCopy(data[nbRowsInsideMatrix+i],
                     other.data[i%other.height]+fromCol,nbColsInsideMatrix);
	  const size_t nbFullWidth = nbColsOutsideMatrix/other.width;
	  const size_t nbPartWidth = nbColsOutsideMatrix%other.width;
	  for(size_t j=0 ; j<nbFullWidth ; ++j)
            memoryCopy(data[nbRowsInsideMatrix+i]+nbColsInsideMatrix+j*other.width,
                       other.data[i%other.height],other.width);
          if (nbPartWidth)
            memoryCopy(data[nbRowsInsideMatrix+i]+nbColsInsideMatrix+nbFullWidth*other.width,
	               other.data[i%other.height],nbPartWidth);
	}
      }
    }//end if (loop && nbColsOutsideMatrix)
    else
    {
      for(size_t i=0 ; i<nbRowsInsideMatrix ; ++i)
        memoryCopy(data[i],other.data[fromRow+i]+fromCol,nbColsInsideMatrix);
    }
  }//end if (data && other.data)
  binaryMode = other.binaryMode;
}
//end copy<T>(fromRow,fromCol,width,height)

template<typename T> template<typename U>
inline void Matrix<T>::copy(const Matrix<U>& other,size_t fromRow, size_t fromCol,
		                            size_t aHeight, size_t aWidth,
					    bool loop)
{
  height = aHeight;
  width  = aWidth;
  if (data && other.getData())
  {
    if (!loop)
    {
      T* pDataDest = 0;
      const size_t lastRow = std::min(other.getHeight(),fromRow+height);
      const size_t lastCol = std::min(other.getWidth() ,fromCol+width);
      for(size_t r=fromRow ; r<lastRow ; ++r)
      {
        pDataDest=data[r-fromRow];
        const U* pDataSrc = other.getData()[r]+fromCol;
        for(size_t c=fromCol ; c<lastCol ; ++c)
          *pDataDest++ = static_cast<T>(*pDataSrc++);
      }
    }//end if !loop
    else //if loop
    {
      T* pDataDest = data[0];
      for(size_t h=0 ; h<height ; ++h)
      {
	for(size_t w=0 ; w<width ; ++w)
          *pDataDest++ = static_cast<T>(other[(fromRow+h)%other.getHeight()]
			                     [(fromCol+w)%other.getWidth()]);
      }
    }//end if loop
  }//end if (data && other.data)
  binaryMode = other.getBinaryMode();
}
//end copy<U>(fromRow,fromCol,width,height)

template<typename T>
inline Matrix<T>& Matrix<T>::resize(size_t newHeight, size_t newWidth,
		             size_t topMargin, size_t leftMargin, const T& fill)
{
  if ((newHeight!=height) || (newWidth != width) || topMargin || leftMargin)
  {
    T** newData = allocation(newHeight,newWidth,fill);
  
    const size_t fromRow = (newHeight>height) ? 0 : topMargin;
    const size_t fromCol = (newWidth >width ) ? 0 : leftMargin;
    const size_t toRow   = std::min(fromRow+newHeight, height);
    const size_t toCol   = std::min(fromCol+newWidth , width);
    const size_t targetFromRow = (newHeight>height) ? topMargin  : 0;
    const size_t targetFromCol = (newWidth >width ) ? leftMargin : 0;

    if ((fromCol<width) && (toCol>fromCol) && (targetFromCol<newWidth))
    {
      for(size_t i=fromRow ; (i<toRow) && (targetFromRow+i-fromRow<newHeight) ; ++i)
        memoryCopy(newData[targetFromRow+i-fromRow]+targetFromCol,data[i]+fromCol,
	           toCol-fromCol);
    }
  
    clean();
  
    height = newHeight;
    width  = newWidth;
    data   = newData;
  }
  
  return *this;
}
//end resize()

template<typename T>
inline Matrix<T>& Matrix<T>::transpose(void)
{
  if (!height || !width || (height == width))
  {
    T temp;
    for(size_t h=1 ; h<height; ++h)
    {
      for(size_t w=0 ; (w<width) && (w<h) ; ++w)
      {
        temp       = data[h][w];
        data[h][w] = data[w][h];
        data[w][h] = temp;
      }
    }
  }
  else // height != width
  {
    T** newData = allocation(width,height);
    const T* pDataSrc = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      newData[i%width][i/width] = *pDataSrc++;;

    const size_t height_sav = height;
    const size_t width_sav  = width;
    clean();
    height = width_sav;
    width  = height_sav;
    data = newData;
  }
  return *this;
}
//end transpose()

template<typename T>
inline Matrix<T>& Matrix<T>::shift(int deltaHeight, int deltaWidth, const T& fill)
{
  if (data)
  {
    if (    ((deltaHeight>0) && (static_cast<size_t>(deltaHeight)>=height))
         || ((deltaHeight<0) && (static_cast<size_t>(-deltaHeight)>=height))
         || ((deltaWidth>0) && (static_cast<size_t>(deltaWidth)>=width))
         || ((deltaWidth<0) && (static_cast<size_t>(-deltaWidth)>=width))
       )//nothing to do
    {
      memorySet(data[0],height*width,fill);
    }
    else //the matrix won't be full of "fill"
    {
      if (deltaHeight>0)
      {
        memoryMove(data[deltaHeight],data[0],(height-deltaHeight)*width);
	memorySet(data[0],deltaHeight*width,fill);
	if (deltaWidth>0)
	{
	  for(size_t h=static_cast<size_t>(deltaHeight),
	      length = width-deltaWidth; h<height ; ++h)
	  {
	    memoryMove(data[h]+deltaWidth,data[h],length);
	    memorySet(data[h],deltaWidth,fill);
	  }
	}
	else if (deltaWidth<0)
	{
	  deltaWidth = -deltaWidth;
	  for(size_t h=static_cast<size_t>(deltaHeight),
	      length = width-deltaWidth; h<height ; ++h)
	  {
	    memoryMove(data[h],data[h]+deltaWidth,length);
	    memorySet(data[h]+width-deltaWidth,deltaWidth,fill);
	  }
	}
      }//end shift with deltaHeight>0
      else if (deltaHeight<0)
      {
	deltaHeight = -deltaHeight;
        memoryMove(data[0],data[deltaHeight],(height-deltaHeight)*width);
	memorySet(data[height-deltaHeight],deltaHeight*width,fill);
	if (deltaWidth>0)
	{
	  for(size_t h=0, length = width-deltaWidth;		  
	      h+static_cast<size_t>(deltaHeight)<height ; ++h)
	  {
	    memoryMove(data[h]+deltaWidth,data[h],length);
	    memorySet(data[h],deltaWidth,fill);
	  }
	}
	else if (deltaWidth<0)
	{
	  deltaWidth = -deltaWidth;
	  for(size_t h=0, length = width-deltaWidth;
	      h+static_cast<size_t>(deltaHeight)<height ; ++h)
	  {
	    memoryMove(data[h],data[h]+deltaWidth,length);
	    memorySet(data[h]+width-deltaWidth,deltaWidth,fill);
	  }
	}
      }//end shift with deltaHeight<0
    }
  }//end if (data)
	  
  return *this;
}
//end shift()

template<typename T>
inline Matrix<T>& Matrix<T>::loop(int deltaHeight, int deltaWidth)
{
  if (data)
  {
    deltaWidth %= width;
    if (deltaWidth>0)
    {
      const size_t marginLength = deltaWidth*sizeof(T);
      char* buffer = new(nothrow) char[marginLength/sizeof(char)]; //avoid construction of T objects
      if (buffer)
      {
        for(size_t h=0 ; h<height ; ++h)
        {
          memoryCopy((T*) buffer,data[h]+width-deltaWidth,deltaWidth);
	  memoryMove(data[h]+deltaWidth,data[h],width-deltaWidth);
	  memoryCopy(data[h],(T*) buffer,deltaWidth);
        }
        delete [] buffer;
      }
      else //no memory to optimize
      {
        for(size_t h=0 ; h<height ; ++h)
        {
          for(int i=0 ; i<deltaWidth ; ++i)
	  {
            T* row = data[h];
	    T temp = row[width-1];
            for(size_t c=width-1 ; c>0 ; --c)
              row[c] = row[c-1];
	    row[0] = temp;
	  }
	}
      }//end if cannot optimize
    }//end loop with deltawith>0
    else if (deltaWidth<0)
    {
      deltaWidth = -deltaWidth;
      const size_t marginLength = deltaWidth*sizeof(T);
      char* buffer = new(nothrow) char[marginLength/sizeof(char)]; //avoid construction of T objects
      if (buffer)
      {
        for(size_t h=0 ; h<height ; ++h)
        {
          memoryCopy((T*) buffer,data[h],deltaWidth);
	  memoryMove(data[h],data[h]+deltaWidth,width-deltaWidth);
	  memoryCopy(data[h]+width-deltaWidth,(T*) buffer,deltaWidth);
        }
        delete [] buffer;
      }
      else //if no memory to optimize
      {
        for(size_t h=0 ; h<height ; ++h)
        {
          for(int i=0 ; i<deltaWidth ; ++i)
          {
            T* row = data[h];
	    T temp = row[0];
            for(size_t c=0 ; c+1<width ; ++c)
              row[c] = row[c+1];
	    row[width-1] = temp;
	  }
	}
      }
    }//end loop with deltaWidth<0
    
    deltaHeight %= height;
    if (deltaHeight>0)
    {
      const size_t marginLength = deltaHeight*width*sizeof(T);
      char* buffer = new(nothrow) char[marginLength/sizeof(char)]; //avoid construction of T objects
      if (buffer)
      {
        memoryCopy((T*) buffer, data[height-deltaHeight], deltaHeight*width);
        memoryMove(data[deltaHeight],data[0],(height-deltaHeight)*width);
        memoryCopy(data[0],(T*) buffer,deltaHeight*width);
        delete [] buffer;

      }
      else//no memory for best optimization, try simpler optimization
      {
	size_t length = width*sizeof(T);
        buffer = new(nothrow) char[length/sizeof(char)];
	if (buffer)
	{
          for(int i=0 ; i<deltaHeight ; ++i)
          {
	    memoryCopy((T*) buffer,data[height-1],width);
	    for(size_t h=height-1 ; h>0 ; --h)
	      memoryCopy(data[h],data[h-1],width);
	    memoryCopy(data[0],(T*) buffer,width);
	  }
	  delete [] buffer;
	}
	else//no memory for optimization at all
	{
	  const size_t size = height*width;
	  for(int i=0 ; i<deltaHeight ; ++i)
	  {
	    T temp = data[height-1][width-1];
	    T* pData = &data[height-1][width-1];
	    for(size_t count = size-1 ; count>0 ; --count,--pData)
	      *pData = *(pData-1);
	    *pData = temp;
	  }
	}//end no optimization
      }//end no best optimization
    }//end loop with deltaHeight>0
    else if (deltaHeight<0)
    {
      deltaHeight = -deltaHeight;
      const size_t marginLength = deltaHeight*width*sizeof(T);
      char* buffer = new(nothrow) char[marginLength/sizeof(char)]; //Avoid construction of T objects
      if (buffer)
      {
        memoryCopy((T*) buffer, data[0], deltaHeight*width);
        memoryMove(data[0],data[deltaHeight],(height-deltaHeight)*width);
        memoryCopy(data[height-deltaHeight],(T*) buffer,deltaHeight*width);
        delete [] buffer;
      }
      else//no memory for best optimization, try simpler optimization
      {
	const size_t length = width*sizeof(T);
        buffer = new(nothrow) char[length/sizeof(char)];
	if (buffer)
	{
          for(int i=0 ; i<deltaHeight ; ++i)
          {
	    memoryCopy((T*) buffer,data[0],width);
	    for(size_t h=0 ; h+1<height ; ++h)
	      memoryCopy(data[h],data[h+1],width);
	    memoryCopy(data[height-1],(T*) buffer,width);
	  }
	  delete [] buffer;
	}
	else//no memory for optimization at all
	{
	  const size_t size = height*width;
	  for(int i=0 ; i<deltaHeight ; ++i)
	  {
	    T temp = data[0][0];
	    T* pData = &data[0][0];
	    for(size_t count = 0 ; count+1<size ; ++count,++pData)
	      *pData = *(pData+1);
	    *pData = temp;
	  }
	}//end no optimization
      }//end no best optimization
    }//end loop with deltaHeight<0
  }
  return *this;
}
//end loop()

template<typename T>
inline Matrix<T>& Matrix<T>::flip(bool bottomUp, bool leftRight)
{
  if (data)
  {
    bool leftRightAlreadyDone = false;
    if ((bottomUp) && (height>1))
    {
      //avoid construction of T objects
      char* buffer = new(nothrow) char[width*sizeof(T)/sizeof(char)];
      if (buffer)
      {
        for(size_t h=0, limit=height/2 ; h<limit ; ++h)
        {
          memoryCopy((T*) buffer, data[h], width);
	  memoryCopy(data[h], data[height-1-h], width);
	  memoryCopy(data[height-1-h], (T*) buffer, width);
        }
        delete [] buffer;
      }
      else//no memory for optimization
      {
	if (!leftRight)
	{
          for(size_t h=0, limit=height/2 ; h<limit ; ++h)
	  {
	    for(size_t c=0 ; c<width ; ++c)
	      std::swap(data[h][c],data[height-1-h][c]);
          }
	}
	else //if (leftRight) : we can optimize in a single-pass
	{
          for(size_t h=0, limit=height/2 ; h<limit ; ++h)
	  {
	    for(size_t c=0 ; c<width ; ++c)
	      std::swap(data[h][c],data[height-1-h][width-1-c]);
          }
	  leftRightAlreadyDone =true;
	}
      }//end flip rows
    }
    if (leftRight && !leftRightAlreadyDone && (width>1))
    {
      for(size_t h=0 ; h<height ; ++h)
      {
        T* row = data[h];
        for(size_t c = 0, limit = width/2 ; c<limit ; ++c)
          std::swap(row[c],row[width-1-c]);
      }
    }
  }//end if data
  return *this;
}
//end flip()

template<typename T> template<typename U>
inline Matrix<T>::operator Matrix<U>() const
{
  Matrix<U> converted(height,width);
  if (data && converted.getData())
  {
    const T* pDataSrc = data[0];
    U* pDataDest = converted.getData()[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
     *pDataDest++ = static_cast<U>(*pDataSrc++);
  }
  return converted;
}
//end operator Matrix<U>()

#ifdef USE_ALTIVEC
template<>
inline Matrix<float>& Matrix<float>::operator+=(const Matrix<float>& other)
{
  if (data && other.data)
  {
    const size_t minHeight = std::min(height,other.height);
    if (width == other.width)
      Altivec::vadd(this->data[0],other.data[0],this->data[0],minHeight*width);
    else
    {
      const size_t minWidth = std::min(width,other.width);
      for(size_t h=0 ; h<minHeight ; ++h)
        Altivec::vadd(this->data[h],other.data[h],this->data[h],minWidth);
    }
  }//end if data && other.data
  return *this;
}
//end operator+=(Matrix)
#endif

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator+=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ += static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ += static_cast<T>(*pDataSrc++);
      }
    }//end if (width != width2)
  }//end if data && data2
  return *this;
}
//end operator+=(Matrix)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator+=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ += static_cast<T>(x);
  }
  return *this;
}
//end operator+=(x)

#ifdef USE_ALTIVEC
template<>
inline Matrix<float>& Matrix<float>::operator-=(const Matrix<float>& other)
{
  if (data && other.data)
  {
    const size_t minHeight = std::min(height,other.height);
    if (width == other.width)
      Altivec::vsub(this->data[0],other.data[0],this->data[0],minHeight*width);
    else
    {
      const size_t minWidth = std::min(width,other.width);
      for(size_t h=0 ; h<minHeight ; ++h)
        Altivec::vsub(this->data[h],other.data[h],this->data[h],minWidth);
    }
  }//end if data && other.data
  return *this;
}
//end operator-=(Matrix)
#endif

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator-=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ -= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ -= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && other.data)
  return *this;
}
//end operator-=(Matrix)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator-=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ -= static_cast<T>(x);
  }
  return *this;
}
//end operator-=(x)

#ifdef USE_ALTIVEC
template<>
inline Matrix<float>& Matrix<float>::operator*=(const Matrix<float>& other)
{
  if (data && other.data)
  {
    const size_t minHeight = std::min(height,other.height);
    if (width == other.width)
      Altivec::vmul(this->data[0],other.data[0],this->data[0],minHeight*width);
    else
    {
      const size_t minWidth = std::min(width,other.width);
      for(size_t h=0 ; h<minHeight ; ++h)
        Altivec::vmul(this->data[h],other.data[h],this->data[h],minWidth);
    }
  }//end if data && other.data
  return *this;
}
//end operator*=(Matrix)
#endif

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator*=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ *= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ *= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && data2)
  return *this;
}
//end operator*=(Matrix)

#ifdef USE_ALTIVEC
template<> inline Matrix<float>& Matrix<float>::operator*=(const bool& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const char& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const unsigned char& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const short& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const unsigned short& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const int& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const unsigned int& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const long& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const unsigned long& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const float& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator*=(const double& x)
{
  if (data) Altivec::vmul(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
//end operator*=(Matrix)
#endif

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator*=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ *= static_cast<T>(x);
  }
  return *this;
}
//end operator*=(x)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator/=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ /= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height);
      const size_t minWidth  = std::min(width , width);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ /= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && .data2)
  return *this;
}
//end operator/=(Matrix)

#ifdef USE_ALTIVEC
template<> inline Matrix<float>& Matrix<float>::operator/=(const bool& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const char& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const unsigned char& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const short& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const unsigned short& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const int& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const unsigned int& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const long& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const unsigned long& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const float& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
template<> inline Matrix<float>& Matrix<float>::operator/=(const double& x)
{
  if (data) Altivec::vdiv(this->data[0],static_cast<float>(x),this->data[0],height*width);
  return *this;
}
//end operator*=(Matrix)
#endif

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator/=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ /= static_cast<T>(x);
  }
  return *this;
}
//end operator/=(x)
    
template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator&=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ &= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ &= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && data2)
  return *this;
}
//end operator&=(Matrix)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator&=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ &= static_cast<T>(x);
  }
  return *this;
}
//end operator&=(x)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator|=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ |= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ |= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && data2)
  return *this;
}
//end operator|=(Matrix)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator|=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ |= static_cast<T>(x);
  }
  return *this;
}
//end operator|=(x)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator^=(const Matrix<U>& other)
{
  const U** data2 = const_cast<const U**>(other.getData());
  if (data && data2)
  {
    const size_t height2 = other.getHeight();
    const size_t width2  = other.getWidth();
    if (width == width2)
    {
      T* pDataDest      = data[0];
      const U* pDataSrc = data2[0];
      for(size_t i=0,size=width*std::min(height,height2) ; i<size ; ++i)
        *pDataDest++ ^= static_cast<T>(*pDataSrc++);
    }
    else// if (width != width2)
    {
      const size_t minHeight = std::min(height, height2);
      const size_t minWidth  = std::min(width , width2);
      T* pDataDest      = 0;
      const U* pDataSrc = 0;
      for(size_t h=0 ; h<minHeight ; ++h)
      {
        pDataDest = data[h];
        pDataSrc  = data2[h];
        for(size_t w=0 ; w<minWidth ; ++w)
          *pDataDest++ ^= static_cast<T>(*pDataSrc++);
      }
    }
  }//end if (data && data2)
  return *this;
}
//end operator^=(Matrix)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator^=(const U& x)
{
  if (data)
  {
    T* pDataDest = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      *pDataDest++ ^= static_cast<T>(x);
  }
  return *this;
}
//end operator^=(x)

template<>
inline Matrix<bool> Matrix<bool>::operator~()
{
  Matrix<bool> m(height,width);
  if (data)
  {
    bool* pDataDest = m.data[0];
    const bool* pDataSrc = data[0];
    for(size_t i=0,size=height*width; i<size ; ++i)
      *pDataDest++ = !(*pDataSrc++);
  }
  return m;
}
//end operator~(bool)

template<typename T>
inline Matrix<T> Matrix<T>::operator~()
{
  Matrix<T> m(height,width);
  if (data)
  {
    T* pDataDest = m.data[0];
    const T* pDataSrc = data[0];
    for(size_t i=0,size=height*width; i<size ; ++i)
      *pDataDest++ = static_cast<T>(~(*pDataSrc++));
  }
  return m;
}
//end operator~()

template<typename T, typename U>
inline bool operator==(const Matrix<T>& m1, const Matrix<U>& m2)
{
  bool equal = (&m1 == &m2);
  if (!equal)
  {
    equal = ((m1.getHeight()==m2.getHeight()) && (m1.getWidth() == m2.getWidth()));
    if (equal && m1.getData() && m2.getData())
    {
      const T* pData1 = m1.getData()[0];
      const U* pData2 = m2.getData()[0];
      for(size_t i=0,size=m1.getHeight()*m1.getWidth() ; equal && (i<size) ; ++i)
        equal &= (*pData1++ == *pData2++);
    }
  }
  return equal;
}
//end operator==(T,U)

template<typename T, typename U>
inline bool operator<(const Matrix<T>& m1, const Matrix<U>& m2)
{
  bool mayBeLowerThan = !(&m1 == &m2);
  if (mayBeLowerThan)
  {
    mayBeLowerThan = ((m1.getHeight()==m2.getHeight()) && (m1.getWidth() == m2.getWidth()));
    if (mayBeLowerThan)
    {
      if (!m1.getHeight() || !m1.getWidth())
	mayBeLowerThan = false;
      else
      {
        const T* pData1 = m1.getData()[0];
        const U* pData2 = m2.getData()[0];
        for(size_t i=0,size=m1.getHeight()*m1.getWidth();
	    mayBeLowerThan && (i<size) ; ++i)
          mayBeLowerThan &= (*pData1++ < *pData2++);
      }
    }
  }
  return mayBeLowerThan;
}
//end operator<(T,U)

template<typename T, typename U>
inline bool operator<=(const Matrix<T>& m1, const Matrix<U>& m2)
{
  bool mayBeLowerOrEqualThan = !(&m1 == &m2);
  if (mayBeLowerOrEqualThan)
  {
    mayBeLowerOrEqualThan = ((m1.getHeight()==m2.getHeight()) && (m1.getWidth() == m2.getWidth()));
    if (mayBeLowerOrEqualThan && m1.getData() && m2.getData())
    {
      const T* pData1 = m1.getData()[0];
      const U* pData2 = m2.getData()[0];
      for(size_t i=0,size=m1.getHeight()*m1.getWidth();
	  mayBeLowerOrEqualThan && (i<size) ; ++i)
        mayBeLowerOrEqualThan &= (*pData1++ <= *pData2++);
    }
  }
  return mayBeLowerOrEqualThan;
}
//end operator<=(T,U)

template<typename T, typename U>
inline bool operator>(const Matrix<T>& m1, const Matrix<U>& m2)
{
  bool mayBeGreaterThan = !(&m1 == &m2);
  if (mayBeGreaterThan)
  {
    mayBeGreaterThan = ((m1.getHeight()==m2.getHeight()) && (m1.getWidth() == m2.getWidth()));
    if (mayBeGreaterThan)
    {
      if (!m1.getHeight() || !m1.getWidth())
	mayBeGreaterThan = false;
      else
      {
        const T* pData1 = m1.getData()[0];
        const U* pData2 = m2.getData()[0];
        for(size_t i=0,size=m1.getHeight()*m1.getWidth();
	    mayBeGreaterThan && (i<size) ; ++i)
          mayBeGreaterThan &= (*pData1++ > *pData2++);
      }
    }
  }
  return mayBeGreaterThan;
}
//end operator>(T,U)

template<typename T, typename U>
inline bool operator>=(const Matrix<T>& m1, const Matrix<U>& m2)
{
  bool mayBeGreaterOrEqualThan = !(&m1 == &m2);
  if (mayBeGreaterOrEqualThan)
  {
    mayBeGreaterOrEqualThan = ((m1.getHeight()==m2.getHeight()) && (m1.getWidth() == m2.getWidth()));
    if (mayBeGreaterOrEqualThan && m1.getData() && m2.getData())
    {
      const T* pData1 = m1.getData()[0];
      const U* pData2 = m2.getData()[0];
      for(size_t i=0,size=m1.getHeight()*m1.getWidth();
	  mayBeGreaterOrEqualThan && (i<size) ; ++i)
        mayBeGreaterOrEqualThan &= (*pData1++ >= *pData2++);
    }
  }
  return mayBeGreaterOrEqualThan;
}
//end operator>=(T,U)

template<typename T>
inline Matrix<T>& Matrix<T>::fillWith(size_t targetRow, size_t targetCol,
		               const Matrix<T>& other,
			       size_t fromRow, size_t fromCol,
		               size_t height, size_t width, bool loop)
{
  if (data && other.data)
  {
    if (loop)
    {
      targetCol %= this->width;
      fromCol   %= other.width;
      const size_t lastRow = targetRow+height;
      const size_t lastCol = targetCol+width;
      const size_t lengthLeft  = fromCol;
      const size_t lengthRight = other.width-lengthLeft;
      const size_t limit = std::min(this->width,lastCol);
      T* pLine = 0;
      const T* pLineOther = 0;
      for(size_t r=targetRow ; r<lastRow ; ++r)
      {
	pLine = data[r%this->height];
	pLineOther = other.data[(fromRow+r-targetRow)%other.height];
	size_t c = targetCol;
	while(c+other.width<limit)
	{
	  memoryCopy(&pLine[c],&pLineOther[fromCol],lengthRight);
	  c += lengthRight,
	  memoryCopy(&pLine[c],&pLineOther[0],lengthLeft);
	  c += lengthLeft;
	}
        while(c<limit)
	{
          pLine[c] = static_cast<T>(pLineOther[(fromCol+c-targetCol)%other.width]);
	  ++c;
	}
	while(c+other.width<lastCol)
	{
	  memoryCopy(&pLine[c%this->width],&pLineOther[fromCol],lengthRight);
	  c += lengthRight,
	  memoryCopy(&pLine[c%this->width],&pLineOther[0],lengthLeft);
	  c += lengthLeft;
	}
        while(c<lastCol)
	{
          pLine[c%this->width] = static_cast<T>(pLineOther[(fromCol+c-targetCol)%other.width]);
	  ++c;
	}
      }
    } //end if (loop)
    else //if (!loop)
    {
      if ((targetRow<this->height) && (targetCol<this->width) &&
          (fromRow<other.getHeight()) && (fromCol<other.getWidth()))
      {
        const size_t lastRow = std::min(this->height,targetRow+height);
        size_t length  = (targetCol+width <= this->width ) ? width
                               : (this->width<targetCol ? 0 : this->width-targetCol);
        if (fromCol+length>other.getWidth()) length = other.getWidth()-fromCol;
 
        for(size_t r=targetRow ; (r<lastRow) && (fromRow+r-targetRow<other.getHeight()) ; ++r)
          memoryCopy(data[r]+targetCol,other.data[fromRow+r-targetRow]+fromCol, length);
      }
    }//end if (!loop)
  }//end if (data)
  return *this;
}
//end fillWith<T>(here,other<T>,from,size)

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::fillWith(size_t targetRow, size_t targetCol,
		               const Matrix<U>& other,
			       size_t fromRow, size_t fromCol,
		               size_t height, size_t width, bool loop)
{
  if (data && other.getData())
  {
    if (loop)
    {
      const size_t lastRow = targetRow+height;
      const size_t lastCol = targetCol+width;
      T* pLine = 0;
      const U* pLineOther = 0;
      for(size_t r=targetRow ; r<lastRow ; ++r)
      {
	pLine = data[r%this->height];
	pLineOther = other[(fromRow+r-targetRow)%other.getHeight()];
        for(size_t c=targetCol ; c<lastCol ; ++c)
          pLine[c%this->width] = static_cast<T>(pLineOther[(fromCol+c-targetCol)%other.getWidth()]);
      }
    }//end if (loop)
    else //if (!loop)
    {
      if ((targetRow<this->height) && (targetCol<this->width) &&
          (fromRow<other.getHeight()) && (fromCol<other.getWidth()))
      {
        const size_t lastRow = std::min(this->height,targetRow+height);
        size_t lastCol = std::min(this->width,targetCol+width);
        if (fromCol+lastCol-targetCol>other.getWidth())
          lastCol -= (fromCol+lastCol-targetCol)-other.getWidth();
        T* pDataDest = 0;
        const U* pDataSrc = 0;
        for(size_t r=targetRow ; (r<lastRow) && (fromRow+r-targetRow<other.getHeight()) ; ++r)
        {
          pDataDest = data[r]+targetCol;
          pDataSrc  = other.getData()[fromRow+r-targetRow]+fromCol;
          for(size_t c=targetCol ; c<lastCol ; ++c)
            *pDataDest++ = static_cast<T>(*pDataSrc++);
        }
      }
    }//end if (!loop)
  }//end if (data)
  return *this;
}
//end fillWith<U>(here,other<U>,from,size)

template<typename T> template<typename U,typename V>
inline Matrix<T>& Matrix<T>::operator()(U (*f)(const V&))
{
  if (data)
  {
    T* pData = data[0];
    for(size_t i=0,size=height*with ; i<size ; ++i,++pData)
      *pData = static_cast<T>((*f)(*pData));
  }
  return *this;
}
//end operator(U f(const V&))

template<typename T> template<typename U,typename V>
inline Matrix<T>& Matrix<T>::operator()(U (*f)(V))
{
  if (data)
  {
    T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i, ++pData)
      *pData = static_cast<T>((*f)(*pData));
  }
  return *this;
}
//end operator(U f(V))

template<typename T> template<typename U>
inline Matrix<T>& Matrix<T>::operator()(const U& f)
{
  if (data)
  {
    T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i,++pData)
      *pData = static_cast<T>(f(*pData));
  }
  return *this;
}
//end operator(functer)

#ifdef USE_ALTIVEC
template<> inline float Matrix<float>::sum(void) const
{
  float cumul = 0;
  if (data) cumul = Altivec::vsum(data[0],height*width);
  return cumul;
}
#endif

template<typename T>
inline T Matrix<T>::sum(void) const
{
  T cumul = T();
  if (data)
  {
    const T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      cumul += *pData++;
  }
  return cumul;
}
//end sum()

template<typename T>
inline T Matrix<T>::product(void) const
{
  T cumul = T();
  if (data)
  {
    const T* pData = data[0];
    cumul = *pData++;
    for(size_t i=1,size=height*width ; i<size ; ++i)
      cumul *= *pData++;
  }
  return cumul;
}
//end product()

template<typename T>
inline T Matrix<T>::mean(void) const
{
  T mu = T();
  if (data)
  {
    const size_t size = height*width;
    const T* pData = data[0];
    for(size_t i=0 ; i<size ; ++i)
      mu += *pData++;
    mu /= size;
  }
  return mu;
}
//end mean()

template<typename T>
inline T Matrix<T>::max(void) const
{
  T max = T();
  if (data)
  {
    T temp;
    const T* pData = data[0];
    max = *pData++;
    for(size_t i=1,size=height*width ; i<size ; ++i)
    {
      temp = *pData++;
      if (temp>max) max = temp;
    }
  }
  return max;
}
//end max()

template<typename T>
inline T Matrix<T>::min(void) const
{
  T min = T();
  if (data)
  {
    T temp;
    const T* pData = data[0];
    min = *pData++;
    for(size_t i=1,size=height*width ; i<size ; ++i)
    {
      temp = *pData++;
      if (temp<min) min = temp;
    }
  }
  return min;
}
//end min()

#ifdef USE_ALTIVEC
template<> inline float Matrix<float>::norm1(void) const
{
  float cumul = 0;
  if (data) cumul = Altivec::vabssum(data[0],height*width);
  return cumul;
}
#endif

template<typename T>
inline T Matrix<T>::norm1(void) const
{
  T norm = T();
  if (data)
  {
    const T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
      norm += Matrix<T>::abs(*pData++);
  }
  return norm;
}
//end norm1()

template<typename T>
inline T Matrix<T>::norm2(void) const
{
  T norm = T();
  if (data)
  {
    T temp;
    const T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
    {
      temp = *pData++;
      norm += temp*temp;
    }
  }
  return static_cast<T>(std::sqrt(static_cast<double>(norm)));
}
//end norm2()

template<typename T>
inline T Matrix<T>::normInf(void) const
{
  T norm = T();
  if (data)
  {
    const T* pData = data[0];
    T temp = *pData++;
    norm = Matrix<T>::abs(temp);
    for(size_t i=1,size=height*width ; i<size ; ++i)
    {
      temp = Matrix<T>::abs(*pData++);
      if (norm<temp) norm = temp;
    }
  }
  return norm;
}
//end normInf()

template<typename T>
inline Matrix<T>& Matrix<T>::oppose(void)
{
  if (data)
  {
    T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i,++pData)
    {
      *pData = -(*pData);
    }
  }
  return *this;
}
//end oppose()

template<typename T>
inline Matrix<T>& Matrix<T>::inverse(void)
{
  if (data)
	{
	  T temp;
	  T* pData = data[0];
		for(size_t i=0,size=height*width ; i<size ; ++i,++pData)
		{
		  temp = *pData;
      *pData = static_cast<T>(1)/temp;
		}
	}
	return *this;
}
//end inverse()

#ifdef USE_ALTIVEC
template<>
inline Matrix<float>& Matrix<float>::sqr(void)
{
  if (data) Altivec::vsqr(data[0],data[0],height*width);
	return *this;
}
//end sqr()
#endif

template<typename T>
inline Matrix<T>& Matrix<T>::sqr(void) {return ((*this)*=(*this));}
//end sqr()

template<typename T>
inline Matrix<T>& Matrix<T>::abs(void)
{
  if (data)
  {
    T temp;
    T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i)
    {
      temp = *pData;
      *pData++ = Matrix<T>::abs(temp);
    }
  }
  return *this;
}
//end abs()

#ifdef USE_ALTIVEC
template<>
inline Matrix<float>& Matrix<float>::mulAdd(const Matrix<float>& m1, const Matrix<float>& m2)
{
  if (data && m1.getData() && m2.getData())
	{
	  const size_t minHeight =
		                   std::min(height,std::min(m1.getHeight(),m2.getHeight()));
	  if ((width == m1.getWidth()) && (width == m2.getWidth()))
		{
		  Altivec::vam(m1.getData()[0],m2.getData()[0],data[0],data[0],minHeight*width);
		}
		else
		{
		  const size_t minWidth =
	                   std::min(width,std::min(m1.getWidth(),m2.getWidth()));
		  for(size_t h=0 ; h<minHeight ; ++h)
			{
		    float* pData = data[h];
	      const float* pData1 = m1.getData()[h];
        const float* pData2 = m2.getData()[h];
			  for(size_t w=0 ; w<minWidth ; ++w)
			    *pData++ *= (*pData1++) + (*pData2++);
			}
		}//end if different widths
	}//end if data
	return *this;
}
//end mulAdd()
#endif

template<typename T>
template<typename U,typename V>
inline Matrix<T>& Matrix<T>::mulAdd(const Matrix<U>& m1, const Matrix<V>& m2)
{
  if (data && m1.getData() && m2.getData())
	{
	  const size_t minHeight =
		                   std::min(height,std::min(m1.getHeight(),m2.getHeight()));
		T* pData = 0;
	  const U* pData1 = 0;
	  const V* pData2 = 0;
	  if ((width == m1.getWidth()) && (width == m2.getWidth()))
		{
		  pData = data[0];
	    pData1 = m1.getData()[0];
      pData2 = m2.getData()[0];
		  for(size_t i=0,size=minHeight*width ; i<size ; ++i)
			  *pData++ *= static_cast<T>(*pData1++) + static_cast<T>(*pData2++);
		}
		else
		{
		  const size_t minWidth =
	                   std::min(width,std::min(m1.getWidth(),m2.getWidth()));
		  for(size_t h=0 ; h<minHeight ; ++h)
			{
		    pData = data[h];
	      pData1 = m1.getData()[h];
        pData2 = m2.getData()[h];
			  for(size_t w=0 ; w<minWidth ; ++w)
			    *pData++ *= static_cast<T>(*pData1++) + static_cast<T>(*pData2++);
			}
		}//end if different widths
	}//end if data
	return *this;
}
//end mulAdd()

template<typename T>
inline Matrix<T>& Matrix<T>::pow(long n)
{
  if (data)
	{
	  if (n == 0)
		  this->fillWith(static_cast<T>(1));
		else if (n<0)
		  this->pow(-n).inverse();
		else if (n>1)
		{
		  if (n%2)
			{
			  Matrix<float> current(*this);
			  (this->pow(n/2)).sqr() *= current;
			}
			else
			  (this->pow(n/2)).sqr();
		}
	}
	return *this;
}
//end pow(long)

template<typename T>
inline Matrix<T>& Matrix<T>::pow(unsigned long n)
{
  if (data)

	{
	  if (n == 0)
		  this->fillWith(static_cast<T>(1));
		else if (n>1)
		{
		  if (n%2)
			{
			  Matrix<float> current(*this);
			  (this->pow(n/2)).sqr() *= current;
			}
			else
			  (this->pow(n/2)).sqr();
		}
	}
	return *this;
}
//end pow(unsigned long)

template<typename T>
inline Matrix<T>& Matrix<T>::pow(double y)
{
  if (data)
  {
    T temp;
    T* pData = data[0];
    for(size_t i=0,size=height*width ; i<size ; ++i,++pData)
    {
      temp = *pData;
      *pData = static_cast<T>(std::pow(static_cast<double>(temp),y));
    }
  }
  return *this;
}
//end pow(double)

#ifdef USE_ALTIVEC
template<> inline pair<float,float> Matrix<float>::meanAndStandardDeviation(void) const
{
  pair<float,float> muAndSigma;
  if (data) muAndSigma = Altivec::meanAndStandardDeviation(data[0],height*width);
  return muAndSigma;
}
#endif

template<typename T>
inline pair<T,T> Matrix<T>::meanAndStandardDeviation(void) const
{
  T mu = T();
  T sigma = T();
  const size_t size = height*width;
  if (size)
  {
    T temp = T();
    const T* pData = data[0];
    for(size_t i=0 ; i<size ; ++i)
    {
      temp   = *pData++;
      mu    += temp;
      sigma += temp*temp;
    }
    mu /= size;
    sigma /=size;
    sigma -= mu*mu;
    sigma = static_cast<T>(std::sqrt(static_cast<double>(sigma)));
  }
  return make_pair(mu,sigma);
}
//end meanAndStandardDeviation()

template<typename T>
inline bool Matrix<T>::any(void) const
{
  bool found = false;
  if (data)
  {
    const T* pData = data[0];
    for(size_t i=0,size=height*width; !found && (i<size) ; ++i)
      found = static_cast<bool>(*pData++);
  }
  return found;
}
//end any()

template<typename T>
inline bool Matrix<T>::every(void) const
{
  bool all = false;
  if (data)
  {
    all = true;
    const T* pData = data[0];
    for(size_t i=0,size=height*width; all && (i<size) ; ++i)
      all &= static_cast<bool>(*pData++);
  }
  return all;
}
//end every()

template<typename T>
inline Matrix<T>& Matrix<T>::writeToStream(ostream& stream) const
{
  stream << height << 'x' << width << ':';
  const size_t size = height*width;
  if (size)
  {
    const T* pData = data[0];
    for(size_t i=0 ; i<size ; ++i)
      stream << *pData++ << ',';
  }
  return const_cast<Matrix<T>&>(*this);
}
//end writeToStream();

template<typename T>
inline Matrix<T>& Matrix<T>::readFromStream(istream& stream)
{
  clean();
  char x = '\0';
  char ddots = '\0';
  size_t tmpHeight = 0;
  size_t tmpWidth  = 0;
  stream >> tmpHeight >> x >> tmpWidth >> ddots;
  if (stream.good() && (x == 'x') && (ddots == ':'))
  {
    height = tmpHeight;
    width  = tmpWidth;
    data = allocation(height,width);
    T* pData = data[0];
    char comma = ',';
    for(size_t i=0, size=height*width ; stream.good() && (comma == ',') && (i<size) ; ++i)
      stream >> *pData++ >> comma;
  }
  return *this;
}
//end readFromSTream()

template<typename T>
inline ostream& operator<<(ostream& stream, const Matrix<T>& m)
{
  const T* pData = (m.getData() ? m.getData()[0] : 0);
  for(size_t h=0 ; h<m.getHeight() ; ++h)
  {
    for(size_t w=0 ; w<m.getWidth() ; ++w)
    {
      stream << *pData++ << '\t';
    }
    stream << '\n';
  }
  return stream;
}
//end operator<<()

//FFT2D
#ifdef USE_ALTIVEC
template<>
inline void Matrix<float>::FFT2D(Matrix<float>* Output_real, Matrix<float>* Output_imag, const Matrix<float>& Input_real, const Matrix<float>& Input_imag,
const FFTSetup& setup)
{
  if (!Matrix<float>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  DSPSplitComplex  inData = {Input_real.data[0]  , Input_imag.data[0]};
  DSPSplitComplex outData = {Output_real->data[0], Output_imag->data[0]};
  int log2InRow = Matrix<float>::LogTwoOfCeilingPowerOfTwo(Input_real.height)-1;
  int log2InCol = Matrix<float>::LogTwoOfCeilingPowerOfTwo(Input_real.width)-1;
  DSPSplitComplex bufferTemp = {0,0};
  bufferTemp.realp = new float[16*1024/sizeof(float)];
  bufferTemp.imagp = new float[16*1024/sizeof(float)];
  ::fft2d_zopt(setup, &inData, 1, 0, &outData, 1, 0,
               &bufferTemp, log2InCol, log2InRow, FFT_FORWARD);

  if (bufferTemp.realp != 0) delete [] bufferTemp.realp;
  if (bufferTemp.imagp != 0) delete [] bufferTemp.imagp;
}
//end FFT2D(float) ALTIVEC

template<>
inline void Matrix<float>::FFT2D(Matrix<float>* Output_real, Matrix<float>* Output_imag, const Matrix<float>& Input_real, const Matrix<float>& Input_imag)
{
  FFTSetup setup =
	  create_fftsetup(Matrix<float>::LogTwoOfCeilingPowerOfTwo(
		                std::max(Input_real.getHeight(),Input_real.getWidth())),
		                FFT_RADIX2);
	if (setup)
	{
	  Matrix<float>::FFT2D(Output_real,Output_imag,Input_real,Input_imag,setup);
	  destroy_fftsetup(setup);
	}
	else
	  throw bad_alloc();
}

template<typename T>
inline void Matrix<T>::FFT2D(Matrix<T>* Output_real, Matrix<T>* Output_imag,
                     const Matrix<T>& Input_real, const Matrix<T>& Input_imag,
                     const FFTSetup& setup)
{
  if (!Matrix<T>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  FFT2D(Output_real, Output_imag, Input_real, Input_imag);
}
//end FFT2D ALTIVEC
#endif

//FFT2D
template<typename T>
inline void Matrix<T>::FFT2D(Matrix<T>* Output_real, Matrix<T>* Output_imag,
                      const Matrix& Input_real, const Matrix& Input_imag)
{
  if (!Matrix<T>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  Matrix<T> buffer_real(Input_real);
  Matrix<T> buffer_imag(Input_imag);
  Four2(Output_real,Output_imag,&buffer_real,&buffer_imag, 1);//2-D FFT
}
//end FFT2D()

//IFFT2D
#ifdef USE_ALTIVEC
template<>
inline void Matrix<float>::IFFT2D(Matrix<float>* Output_real, Matrix<float>* Output_imag,
                           const Matrix<float>& Input_real, const Matrix<float>& Input_imag,
                           const FFTSetup& setup)
{
  if (!Matrix<float>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  DSPSplitComplex  inData = {Input_real.data[0]  , Input_imag.data[0]};
  DSPSplitComplex outData = {Output_real->data[0], Output_imag->data[0]};
  int log2InRow = Matrix<float>::LogTwoOfCeilingPowerOfTwo(Input_real.height)-1;
  int log2InCol = Matrix<float>::LogTwoOfCeilingPowerOfTwo(Input_real.width)-1;
  DSPSplitComplex bufferTemp = {0,0};
  bufferTemp.realp = new float[16*1024/sizeof(float)];
  bufferTemp.imagp = new float[16*1024/sizeof(float)];
  ::fft2d_zopt(setup, &inData, 1, 0, &outData, 1, 0,
               &bufferTemp, log2InCol, log2InRow, FFT_INVERSE);
  if (bufferTemp.realp != 0) delete [] bufferTemp.realp;
  if (bufferTemp.imagp != 0) delete [] bufferTemp.imagp;
  const size_t size = Input_real.height*Input_real.width;
  Altivec::vmul(Output_real->data[0], 1.0/size, Output_real->data[0], size);
  Altivec::vmul(Output_imag->data[0], 1.0/size, Output_imag->data[0], size);
}

template<>
inline void Matrix<float>::IFFT2D(Matrix<float>* Output_real, Matrix<float>* Output_imag, const Matrix<float>& Input_real, const Matrix<float>& Input_imag)
{
  FFTSetup setup =
	  create_fftsetup(Matrix<float>::LogTwoOfCeilingPowerOfTwo(
		                std::max(Input_real.getHeight(),Input_real.getWidth())),
		                FFT_RADIX2);
	if (setup)
	{
	  Matrix<float>::IFFT2D(Output_real,Output_imag,Input_real,Input_imag,setup);
	  destroy_fftsetup(setup);
	}
	else throw bad_alloc();
}
//end IFFT2D(float) ALTIVEC

template<typename T>
inline void Matrix<T>::IFFT2D(Matrix<T>* Output_real, Matrix<T>* Output_imag,
                   const Matrix<T>& Input_real, const Matrix<T>& Input_imag,
                   const FFTSetup& setup)
{
  if (!Matrix<T>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  IFFT2D(Output_real, Output_imag, Input_real, Input_imag);
}
//end IFFT2D() ALTIVEC
#endif

//IFFT2D
template<typename T>
inline void Matrix<T>::IFFT2D(Matrix<T>* Output_real, Matrix<T>* Output_imag,
                       const Matrix<T>& Input_real, const Matrix<T>& Input_imag)
{
  if (!Matrix<T>::CheckFFTOk(Output_real,Output_imag,Input_real,Input_imag)) throw BadFFT();
  Matrix<T> buffer_real(Input_real);
  Matrix<T> buffer_imag(Input_imag);
  Matrix<T>::Four2(Output_real,Output_imag,&buffer_real,&buffer_imag, -1);//2-D FFT
  const size_t size = Input_real.getHeight()*Input_real.getWidth();
  (*Output_real) /= size;
  (*Output_imag) /= size;
}
//end IFFT2D()

//Private utility for FFT2D
template <typename T>
inline void Matrix<T>::Four2(Matrix<T>* fftr_, Matrix<T>* ffti_,
	              Matrix<T>* rdata_, Matrix<T>* idata_, int isign)
{
  Matrix<T>& fftr  = *fftr_;
  Matrix<T>& ffti  = *ffti_;
  Matrix<T>& rdata = *rdata_;
  Matrix<T>& idata = *idata_;

  size_t height = rdata.getHeight();
  size_t width  = rdata.getWidth();

  Matrix<T> tmp1(1,2*width);
  Matrix<T> tmp2(1,2*height);
  Matrix<T> tmp(2*height,width);
  T* p_tmp1 = 0;
  T* p_tmp2 = 0;
  T* p_rdata = 0;
  T* p_idata = 0;
  for(size_t i=0 ; i<height ; ++i)
  {
    p_rdata = rdata[i];
    p_idata = idata[i];
    p_tmp1  = tmp1.getData()[0];
    for (size_t j=0 ; j<width ; ++j)
    {
      *p_tmp1++ = *p_rdata++;
      *p_tmp1++ = *p_idata++;
    }
    
    Four1(&tmp1, isign);

    p_rdata = tmp[i*2];
    p_idata = tmp[i*2+1];
    p_tmp1  = tmp1.getData()[0];
    for (size_t j=0 ; j<width ; ++j)
    {
      *p_rdata++ = *p_tmp1++;
      *p_idata++ = *p_tmp1++;
    }
  }

  for (size_t i=0 ; i<width ; ++i)
  {
    p_tmp2 = tmp2.getData()[0];
    for (size_t j=0 ; j<height ; ++j)
    {
      *p_tmp2++ = tmp[2*j][i];
      *p_tmp2++ = tmp[2*j+1][i];
    }

    Four1(&tmp2,isign);

    p_tmp2 = tmp2.getData()[0];
    for (size_t j=0 ; j<height ; ++j)
    {
      fftr[j][i] = *p_tmp2++;
      ffti[j][i] = *p_tmp2++;
    }
  }
}
//end Four2()

//Private utility for FFT
template<typename T>
inline void Matrix<T>::Four1(Matrix<T>* data_, int isign)
{
  T* data = data_->getData()[0];
  const size_t size = data_->getWidth();

  T wtemp = T();
  T wr    = T();
  T wpr   = T();
  T wpi   = T();
  T wi    = T();
  T theta = T();
  T tempr = T();
  T tempi = T();

  size_t m = 0;
  size_t j = 1;
  for(size_t i=0 ; i+1<size ; i+=2)
  {
    if (j-1 > i)
    {
      //swap data[i] and data[j]
      std::swap(data[i],data[j-1]);
      //swap data[i+1] and data[j+1]
      std::swap(data[i+1],data[j]);
    }
    m = size/2;
    while ((m>=2) && (j>m))
    {
      j -= m;
      m /= 2;
    }
    j += m;
  }

  size_t mmax = 2;
  size_t i = 0;
  size_t istep = 0;
  while (mmax<size)
  {
    istep = 2*mmax;
    theta = static_cast<T>(6.28318530717959/(isign*static_cast<T>(mmax)));
    wtemp = static_cast<T>(std::sin(static_cast<double>(0.5*theta)));
    wpr   = static_cast<T>(-2.0*wtemp*wtemp);
    wpi   = static_cast<T>(std::sin(static_cast<double>(theta)));
    wr    = static_cast<T>(1);
    wi    = static_cast<T>(0);
    for (m=0 ; m+1<mmax ; m+=2)
    {
      for (i=m ; i<size ; i+=istep)
      {
        j          = i+mmax;
        tempr      = wr*data[j]-wi*data[j+1];
        tempi      = wr*data[j+1]+wi*data[j];
        data[j]    = data[i]-tempr;
        data[j+1]  = data[i+1]-tempi;
        data[i]   += tempr;
        data[i+1] += tempi;
      }
      wtemp = wr;
      wr = wtemp*wpr-wi*wpi+wr;
      wi = wi*wpr+wtemp*wpi+wi;
    }
    mmax = istep;
  }
}
//end Four1()

template <typename T>
inline bool Matrix<T>::CheckFFTOk(const Matrix<T>* Output_real, const Matrix<T>* Output_imag,
		             const Matrix<T>& Input_real , const Matrix<T>& Input_imag)
{
  bool ok = (Output_real && Output_imag && 
	           Input_real.getData() && Input_imag.getData() &&
             (Output_real->getHeight() == Output_imag->getHeight()) &&
             (Output_imag->getHeight() == Input_imag.getHeight()) &&
             (Input_imag.getHeight() == Input_real.getHeight()) &&
             (Output_real->getWidth() == Output_imag->getWidth()) &&
             (Output_imag->getWidth() == Input_imag.getWidth()) &&
             (Input_imag.getWidth() == Input_real.getWidth())
					  );
  if (ok)
  {
    const size_t height = Input_real.getHeight();
    const size_t width  = Input_real.getWidth();
    ok = !(height & (height-1)) && !(width & (width-1)); //power of two ?
  }
  return ok;
}
//end CheckFFTOk()

template<> inline bool Matrix<bool>::abs(const bool& x) {return x;}
template<> inline unsigned char  Matrix<unsigned char >::abs(const unsigned char&  x) {return x;}
template<> inline unsigned short Matrix<unsigned short>::abs(const unsigned short& x) {return x;}
template<> inline unsigned int   Matrix<unsigned int  >::abs(const unsigned int&   x) {return x;}
template<> inline unsigned long  Matrix<unsigned long >::abs(const unsigned long&  x) {return x;}
template<typename T> inline T Matrix<T>::abs(const T& x) {return ((x<0) ? -x : x);}

#ifdef USE_ALTIVEC
template<> inline float Matrix<float>::dotProduct(const Matrix<float>& other) const
{
  float cumul = 0;
  if (data && other.data)
  {
    const size_t minSize = std::min(height*width,other.getHeight()*other.getWidth());
    cumul = Altivec::vdotpr(data[0],other.data[0],minSize);
  }
  return cumul;
}
//end dotProduct()

template<>
inline float Matrix<float>::DotProduct(const float* data1,
		                       const float* data2, size_t size)
{
  return Altivec::vdotpr(data1,data2,size);
}
//end DotProduct(data1,data2,size)
#endif


template<typename T> template<typename U>
inline T Matrix<T>::DotProduct(const T* data1, const U* data2, size_t size)
{
  T cumul = T();
  for(size_t i=0 ; i<size ; ++i)
    cumul += (*data1++)*static_cast<T>(*data2++);
  return cumul;
}
//end DotProduct(data1,data2,size)

template<typename T> template<typename U>
inline T Matrix<T>::dotProduct(const Matrix<U>& other) const
{
  T cumul = T();
  if (data && other.getData())
  {
    const size_t minSize = std::min(height*width,other.getHeight()*other.getWidth());
    const T* pData1 = data[0];
    const U* pData2 = other.getData()[0];
    cumul = Matrix<T>::DotProduct(pData1,pData2,minSize);
  }
  return cumul;
}
//end dotProduct()

template<typename T> template<typename U>
inline Matrix<T> Matrix<T>::mulByVec(const Matrix<U>& lineMatrix) const
{
  Matrix<T> result;
  if (data && lineMatrix.getData())
  {
    result.resize(1,height);
    const size_t minWidth = std::min(width,lineMatrix.getWidth());
    for(size_t h=0 ; h<height ; ++h)
      result[0][h] = Matrix<T>::DotProduct(data[h],lineMatrix.getData()[0],minWidth);
  }
  return result;
}
//end mulByVec()

template<typename T>
size_t Matrix<T>::LogTwoOfCeilingPowerOfTwo(size_t x)
{
  size_t result = 0;
  if (x == 0)
    result  = 1;
  else 
  {
    result = (x&(x-1)) ? 1 : 0; //x&(x-1) means "not power of two"
    while(x)
    {
      x >>= 1;
      ++result;
    }
  }
  return result;
}
//end LogTwoOfCeilingPowerOfTwo()

#endif

