/*
inthdl_gcd.c
*/

#include "defs.h"
#include "inthdl.e"
#include "intbig.h"


/*
Shift right xh into yh by the specified number of digits and bits:
*/

#define shift_right(xh, yh, digits, bits)				\
{									\
    inthdl_length	i_, n_ = digits;				\
    t_int	k_ = bits, outk_ = ZETA - k_, outm_ = (1 << k_) - 1; \
    inthdl_length	len1_ = intbig_curr_size(xh) - 1;		\
    t_int	*p_ = intbig_dig0_ptr(xh);			\
    t_int	*q_ = intbig_dig0_ptr(yh);			\
									\
    for (i_ = 0; n_ < len1_; i_++, n_++)				\
	q_[i_] = (p_[n_] >> k_) | ((p_[n_ + 1] & outm_) << outk_);	\
    if (n_ <= len1_ && (q_[i_] = p_[n_] >> k_))				\
	i_++;								\
    intbig_curr_size(yh) = i_;						\
}

void
inthdl_gcd	WITH_3_ARGS(
    inthdl_handle,	x,
    inthdl_handle,	y,
    inthdl_handle,	z
)
/*
Given two multi-precision integer blocks x and y containing POSITIVE integers,
return their greatest common divisor in pre-allocated integer block z, which
must have room for at least min(intbig_curr_size(x), intbig_curr_size(y))
digits.  (Uses the binary method unless x or y is relatively small in which
case the Lehmer method is used - see inthdl_gcd_lehmer().)
*/
{
    Logical		did_mod;
    t_int	i, j, k, n, d, xn, yn, xj, yj;
    inthdl_length	xlen, ylen, ulen, vlen, len;
    inthdl_handle	mod_buf, t, u, v;
    void		inthdl_gcd_lehmer();

    DENY(intbig_sign(x) <= 0 || intbig_sign(y) <= 0);

    xlen = intbig_curr_size(x);
    ylen = intbig_curr_size(y);

    /*
    The following constants have been determined empirically:
    */

    if (xlen <= 10 || ylen <= 10)
    {
	inthdl_gcd_lehmer(x, y, z);
	return;
    }


    /*
    Use the binary method:
    */

    DEBUG_INTHDL_2("+inthdl_gcd_binary", x, y);

    /*
    If the sizes of x and y differ by more than 4 digits, reduce the larger
    one modulo the other first.
    */

    if (xlen > ylen + 4)
    {
	mod_buf = inthdl_buf_alloc(ylen);
	inthdl_rem(x, y, mod_buf);
	
	if (!intbig_sign(mod_buf))
	{
	    intbig_copy_digits(y, 0, ylen, z, 0);
	    intbig_curr_size(z) = ylen;
	    intbig_sign(z) = 1;
	    inthdl_buf_delete(mod_buf);

	    DEBUG_INTHDL_1("-inthdl_gcd_binary", z);
	    return;
	}

	x = mod_buf;
	xlen = intbig_curr_size(x);
	did_mod = TRUE;
    }
    else if (ylen > xlen + 4)
    {
	mod_buf = inthdl_buf_alloc(xlen);
	inthdl_rem(y, x, mod_buf);

	if (!intbig_sign(mod_buf))
	{
	    intbig_copy_digits(x, 0, xlen, z, 0);
	    intbig_curr_size(z) = xlen;
	    intbig_sign(z) = 1;
	    inthdl_buf_delete(mod_buf);

	    DEBUG_INTHDL_1("-inthdl_gcd_binary", z);
	    return;
	}

	y = mod_buf;
	ylen = intbig_curr_size(y);
	did_mod = TRUE;
    }
    else
	did_mod = FALSE;

    /*
    Find maximal power of 2 in x and y to shift out.
    */

    xn = 0;
    while ((d = intbig_digit(x, xn)) == 0)
	xn++;

    xj = 0;
    while ((d & 1) == 0)
    {
	d >>= 1;
	xj++;
    }

    yn = 0;
    while ((d = intbig_digit(y, yn)) == 0)
	yn++;

    yj = 0;
    while ((d & 1) == 0)
    {
	d >>= 1;
	yj++;
    }

    /*
    Initialize u and t to the shifted versions of x and y respectively.
    */

    ulen = xlen - xn;
    vlen = ylen - yn;
    len = (ulen > vlen)? ulen: vlen;

    u = inthdl_buf_alloc(len);
    v = inthdl_buf_alloc(len);
    t = inthdl_buf_alloc(len);

    if (xn || xj)
    {
	shift_right(x, u, xn, xj);
    }
    else
    {
	intbig_copy_digits(x, 0, xlen, u, 0);
	intbig_curr_size(u) = xlen;
    }

    if (yn || yj)
    {
	shift_right(y, v, yn, yj);
    }
    else
    {
	intbig_copy_digits(y, 0, ylen, v, 0);
	intbig_curr_size(v) = ylen;
    }

    /*
    Set n to the number of digits and j to the number of bits which need to
    be shifted back into the result.
    */

    if (xn < yn || xn == yn && xj < yj)
	n = xn, j = xj;
    else
	n = yn, j = yj;

    intbig_sign(u) = 1;

    for (;;)
    {
	/*
	Set t = u - v.
	*/

	intbig_sign(v) = -1;
	inthdl_add(u, v, t);
	intbig_sign(v) = 1;

	if (!intbig_sign(t))
	    break;

	/*
	Shift out the maximal number of zero bits from t.
	*/

	i = 0;

	while ((d = intbig_digit(t, i)) == 0)
	    i++;

	k = 0;
	while ((d & 1) == 0)
	{
	    d >>= 1;
	    k++;
	}

	if (i || k)
	    shift_right(t, t, i, k);

	/*
	Replace the larger of u and v by t (for the first time through the
	loop the smaller may be replaced but this doesn't matter).
	*/

	if (intbig_sign(t) > 0)
	{
	    inthdl_handle	temp;

	    temp = u;
	    u = t;
	    t = temp;
	}
	else
	{
	    inthdl_handle	temp;

	    temp = v;
	    v = t;
	    t = temp;
	}
    }

    ulen = intbig_curr_size(u);

    if (j || n)
    {
	/*
	Shift u right n digits and j bits into z.
	*/

	t_int	*p = intbig_dig0_ptr(u);
	t_int	*q = intbig_dig0_ptr(z);
	t_int	botm, high, top_j = ZETA - j;

	botm = (1 << top_j) - 1;

	intbig_curr_size(z) = ulen + n;
	if (j && (high = p[ulen - 1] >> top_j))
	{
	    q[ulen + n] = high;
	    intbig_curr_size(z)++;
	}

	for (i = ulen - 1; i >= 1; i--)
	    q[i + n] = ((p[i] & botm) << j) | (p[i - 1] >> top_j);

	q[n] = (p[0] & botm) << j;

	for (i = n - 1; i >= 0; i--)
	    q[i] = 0;
    }
    else
    {
	/*
	Copy u directly into z.
	*/

	intbig_copy_digits(u, 0, ulen, z, 0);
	intbig_curr_size(z) = ulen;
    }

    intbig_sign(z) = 1;

    inthdl_buf_delete(t);
    inthdl_buf_delete(u);
    inthdl_buf_delete(v);

    if (did_mod)
	inthdl_buf_delete(mod_buf);

    DEBUG_INTHDL_1("-inthdl_gcd_binary", z);
}




#undef A

static void
inthdl_lincomb_gcd	WITH_5_ARGS(
    inthdl_handle,	u,
    inthdl_handle,	v,
    intbig_medium,	A,
    intbig_medium,	B,
    inthdl_handle,	t
)
/*
Calculate t = A * u + B * v, where u and v are positive with v no longer than
u, and we know that t will be less than v and non-negative, though A and B may
have any sign.
*/
{
    register inthdl_sign	Asign, Bsign;
    register inthdl_length	ulen, vlen;
    register t_int	*up, *vp, *tp, *ulimit, *vlimit, *nonzerop;
    register intbig_medium	carry;

    DEBUG_INTHDL_BETA("+inthdl_lincomb_gcd: u, A", u, A);
    DEBUG_INTHDL_BETA("      lincomb_gcd: v, B", v, B);

    if (A >= 0)
	Asign = 1;
    else
    {
	Asign = -1;
	A = -A;
    }

    if (B >= 0)
	Bsign = 1;
    else
    {
	Bsign = -1;
	B = -B;
    }

    up = intbig_dig0_ptr(u);
    vp = intbig_dig0_ptr(v);
    tp = intbig_dig0_ptr(t);
    nonzerop = tp;

    ulen = intbig_curr_size(u);
    vlen = intbig_curr_size(v);

    ulimit = up + ulen;
    vlimit = vp + vlen;

    carry = 0;

    while (vp < vlimit)
    {
	intbig_medium		s;
	intbig_medium		a0, a1, b0, b1;

	ib_mult(A, *up++, &a1, &a0);
	ib_mult(B, *vp++, &b1, &b0);

	if (Asign < 0)
	    a0 = -a0, a1 = -a1;

	if (Bsign < 0)
	    b0 = -b0, b1 = -b1;

	s = a0 + b0 + carry;
	carry = a1 + b1;

	while (s >= BETA)
	{
	    s -= BETA;
	    carry++;
	}

	while (s < 0)
	{
	    s += BETA;
	    carry--;
	}

	if (*tp++ = s)
	    nonzerop = tp;
    }

    while (up < ulimit)
    {
	intbig_medium		s;
	intbig_medium		a0, a1;

	ib_mult(A, *up++, &a1, &a0);

	if (Asign < 0)
	    a0 = -a0, a1 = -a1;

	s = a0 + carry;
	carry = a1;
	if (s >= BETA)
	{
	    s -= BETA;
	    carry++;
	}

	if (*tp++ = s)
	    nonzerop = tp;
    }

    if (intbig_curr_size(t) = nonzerop - intbig_dig0_ptr(t))
	intbig_sign(t) = 1;
    else
	intbig_sign(t) = 0;

    DEBUG_INTHDL_1("-inthdl_lincomb_gcd", t);

    DENY(carry);
}


void
inthdl_gcd_lehmer	WITH_3_ARGS(
    inthdl_handle,	x,
    inthdl_handle,	y,
    inthdl_handle,	z
)
/*
Given two multi-precision integer blocks x and y containing POSITIVE integers,
return their greatest common divisor in pre-allocated integer block z, which
must have room for at least min(intbig_curr_size(x), intbig_curr_size(y))
digits. (Uses the Lehmer method which seems to be faster than the binary
method when x or y is relatively small.)
*/
{
    /*
    Lehmer's method as described in Knuth's Seminumerical Algorithms (vol 2),
    Second Edition, Algorithm L, 4.5.2, p. 329

    We use single-precision working storage with LAMBDA binary digits (LAMBDA
    is called p in Knuth's exposition).  As Knuth points out, the only case of
    single-precision overflow that can occur below is addition of 2**LAMBDA-1
    (uhat or vhat) to 1 (A or D).  Because 3*BETA = 3*(2**ZETA) is represent-
    able as a machine integer (see intbig.h), we can use one more bit
    than allowed for by ZETA.  So it is safe to define LAMBDA by the following:
    */

#define LAMBDA	(ZETA+1)

    inthdl_handle	u;
    inthdl_handle	v;
    inthdl_handle	t;
    inthdl_handle	w;
    inthdl_length	ulen;
    inthdl_length	vlen;
    intbig_medium	A, B, C, D;

    DEBUG_INTHDL_2("+inthdl_gcd_lehmer", x, y);

    DENY(intbig_sign(x) <= 0 || intbig_sign(y) <= 0);

    if (inthdl_compare(x, y) >= 0)
    {
	t = x;
	w = y;
    }
    else
    {
	t = y;
	w = x;
    }

    ulen = intbig_curr_size(t);	/* the greater length */
    u = inthdl_buf_alloc(ulen);
    intbig_copy_digits(t, 0, ulen, u, 0);
    intbig_curr_size(u) = ulen;
    intbig_sign(u) = 1;

    v = inthdl_buf_alloc(ulen);	/* all working storage same length */
    vlen = intbig_curr_size(v) = intbig_curr_size(w);
    intbig_copy_digits(w, 0, vlen, v, 0);
    intbig_sign(v) = 1;

    /*  now u >= v  */

    t = inthdl_buf_alloc(ulen);
    w = z;

    while (vlen && ulen > 1)
    {
	B = 0;

	if (vlen == ulen || (vlen == ulen-1 /*** && ***/
			    /* u < WHAT??? * v */
/* *** intbig_digit(u,ulen-1) < 2 * intbig_digit(v,vlen-1) *** */)
	)
	{
	    intbig_medium	uhat, vhat;
	    inthdl_length	nshift;

	    /*
    JUSTIFY??
	    Because of the above condition on length and high digits of u and v,
	    we have floor(log2(u)) <= floor(log2(v)) + LAMBDA +- 1,
	    so a Lehmer step is worthwhile in the sense that when we
	    calculate uhat=floor(u/2**k) with k minimal so that
	    uhat < 2**LAMBDA, then vhat=floor(v/2**k) is nonzero.
	    */

	    uhat = intbig_digit(u, ulen-1);

#if 0
	    DEBUG_INTHDL_PRINTF("uhat starts at %d = 0x%x\n", uhat, uhat);
#endif

	    /*
	    shift uhat (which is nonzero) left until bit number LAMBDA-1 is
	    one, i.e. until uhat & (1<<(LAMBDA-1)) == 1; record the number of
	    positions shifted in nshift
	    */

	    nshift = 0;

	    /*
	    By the "nbits high-order bits" of uhat where 1 <= nbits <= LAMBDA,
	    we mean the bits numbered LAMBDA-nbits, ..., LAMBDA-1, that is,
	    the bits in the mask ~ ((1<<(LAMBDA-nbits)) - 1 ).

	    Suppose we know that at least one of the high-order 2*nbits of
	    uhat is nonzero (to start with this is true with 2*nbits = ZETA
	    or 2*nbits = ZETA+1 since uhat is nonzero).

	    Then the following macro boost(nbits) will if necessary shift uhat
	    left, so that afterwards we can be sure that at least one of the
	    high-order nbits is nonzero.
	    */

#define boost(nbits)	\
	if ((uhat & ~ ((1<<(LAMBDA-nbits)) - 1)) == 0) \
		{ uhat <<= nbits; nshift += nbits; }

#if	LAMBDA == 30

	    /*
	    Optimization for the commonest case of LAMBDA = 30

	    To start we know uhat is a nonzero beta_digit so one of its 29
	    high-order bits is nonzero
	    */

	    boost(15);  /* Now one of the 15 high-order bits is nonzero */
	    boost(8);  /* Now one of the  8 high-order bits is nonzero */
	    boost(4);  /* Now one of the  4 high-order bits is nonzero */
	    boost(2);  /* Now one of the  2 high-order bits is nonzero */
	    boost(1);  /* Now the high-order bit is nonzero */

#else

	    /*
	    In general, we start with nbits = ceiling(ZETA/2.0) =
	    (ZETA+1(>>1, and keep replacing nbits by ceiling(nbits/2.0)
	    until after we have reached 1
	    */

	    {
		register int	nbits;

		for (nbits = (ZETA+1)>>1; ; nbits = (nbits+1)>>1);
		{
		    boost(nbits);
		    if (nbits <= 1)
			break;
		}
	    }

#endif


	    /*
	    Now shift the nbits high-order bits (from digit ZETA-1 down) of
	    the next digit of u into the low-order bits of uhat
	    */

	    uhat |= intbig_digit(u, ulen-2) >> (ZETA - nshift);

	    /*
	    Form vhat similarly
	    */

	    vhat = vlen == ulen ? intbig_digit(v, vlen-1) << nshift : 0;
	    vhat |= intbig_digit(v, ulen-2) >> (ZETA - nshift);


	    /*
	    Now perform the Lehmer step
	    */

	    {
		register intbig_medium q, T;

		for (A = 1, /* B = 0, */ C = 0, D = 1;
		     vhat + C != 0 && vhat + D != 0 &&
/* ***
??? ALL THESE NUMBERS NON-NEG SO / IS OK FOR FLOOR(/) ???
??? DISCUSS OVERFLOW ???
*** */
			(q = (uhat+A)/(vhat+C)) == (uhat+B)/(vhat+D);
		     /**/
		  )
		{
		    T = A - q*C; A = C; C = T;
		    T = B - q*D; B = D; D = T;
		    T = uhat - q*vhat; uhat = vhat; vhat = T;
		}
	    }
	}

	if (B == 0)
	{
	    /*
	    The Lehmer step was not performed or got nowhere
	    */

	    register inthdl_handle	temphdl;

	    inthdl_quot_rem(u, v, w, t);  /* ignore the quotient w */
	    temphdl = u; u = v; v = t; t = temphdl;
	}
	else
	{
	    register inthdl_handle	temphdl;

	    inthdl_lincomb_gcd(u, v, A, B, t);
	    inthdl_lincomb_gcd(u, v, C, D, w);
	    temphdl = u; u = t; t = temphdl;
	    temphdl = v; v = w; w = temphdl;
	}

	ulen = intbig_sign(u)? intbig_curr_size(u): 0;
	vlen = intbig_sign(v)? intbig_curr_size(v): 0;

    }  /*  end of main while loop  */

    if (vlen == 0)
    {
	intbig_copy_digits(u, 0, ulen, z, 0);
	intbig_curr_size(z) = ulen;
	intbig_sign(z) = intbig_sign(u);
    }
    else
    {
	/*  vlen == ulen == 1  */

	A = intbig_digit(u, 0);
	B = intbig_digit(v, 0);

	/*  Single-precision gcd  */

	while ((C = A % B) != 0)
	{
	    A = B;
	    B = C;
	}

	if (B > 0)
	{
	    intbig_sign(z) = 1;
	    intbig_digit(z, 0) = B;
	}
	else
	{
	    intbig_sign(z) = -1;
	    intbig_digit(z, 0) = -B;
	}
	intbig_curr_size(z) = 1;
    }

    /*
    Now delete the working storage - at this stage, exactly one of u, v, t, w
    is the same as z, but we don't know which.
    */

    if (u != z)
	inthdl_buf_delete(u);

    if (v != z)
	inthdl_buf_delete(v);

    if (t != z)
	inthdl_buf_delete(t);

    if (w != z)
	inthdl_buf_delete(w);

    DEBUG_INTHDL_1("-inthdl_gcd_lehmer", z);
}
