//
// Copyright (c) 2003..2012 : Henk van Kampen <henk@mediatronix.com>
//
// This file is part of pBlazASM.
//
// pBlazASM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// pBlazASM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with pBlazASM.  If not, see <http://www.gnu.org/licenses/>.
//

//
// Rijndael (AES-128) block cipher
// this code assumes 128b data and a 128b key, so: Nk = Nc = 4 and Nn = 10 (rounds)
// (c) 2003 .. 2012 Henk van Kampen, www.mediatronix.com
//
// based on documents by Dr. Brian Gladman,
//      http://gladman.plushost.co.uk/oldsite/cryptography_technology/rijndael/aes.spec.v316.pdf
// or
//      http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
//
// test data from [Gladman]
//	PLAINTEXT:    3243f6a8885a308d313198a2e0370734 (pi * 2^124)
//	KEY:          2b7e151628aed2a6abf7158809cf4f3c ( e * 2^124)
// should result in:
//	R[10].k_sch   d014f9a8c9ee2589e13f0cc8b6630ca6
//	R[10].output  3925841d02dc09fbdc118597196a0b32
//
// to be done:
//      decrypt
//

// SBOX I/O device, just a look-up ROM
// SBox writes the index byte to this port and reads the substituted byte back from it
SBOX_ROM            .EQU       0xF0                 // uSBOX.VHD is instantiated at this port address

// key value as supplied to Encrypt (test key from [Gladman]); copied to {key}, never modified
inkey               .BYT       0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6, 0xAB, 0xF7, 0x15, 0x88, 0x09, 0xCF, 0x4F, 0x3C

// plaintext (test data from [Gladman]), working round key and result
// {key} is overwritten in place by NextRoundKey; {result} is filled by StateToOut
plain               .BYT       0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, 0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34
key                 .BUF       16
result              .BUF       16

// state buffer, 16 bytes; MixColumns treats every 4 consecutive bytes as one column,
// so the byte in row r of column c is at state + r + 4 * c
state               .BUF       16

// special registers
pKey                .EQU       s1                  // key pointer (source pointer in the copy loops)
pState              .EQU       s2                  // state pointer (destination pointer in the copy loops)

// constants
X                   .EQU       0x01                // first round constant, loaded into sF by Encrypt
G                   .EQU       0x1B                // 0x11B, the field polynomial, without its 9th bit
b128                .EQU       128 / 8             // 128 bits = 16 bytes of 8 bits

// Encrypt: AES-128 (Rijndael) encryption of one 16-byte block
//   in:   {plain} = plaintext, {inkey} = cipher key (both are only read)
//   out:  {result} = ciphertext, a copy of the final {state}
//   regs: s3 = number of full rounds still to do
//         sF = round constant, consumed and advanced by NextRoundKey
// All work is done on the copies in {state} and {key}.
Encrypt::
                    CALL      InToState           // state = plain
                    CALL      InkeyToKey          // key = inkey, which is k[0]
                    CALL      XorRoundKey         // round 0: state ^= k[0]

                    MOVE      s3, 9               // rounds 1 .. Nn - 1 are full rounds
                    MOVE      sF, X               // round constant x^(i-1) for i = 1
RoundLoop:
                    CALL      SubBytes
                    CALL      ShiftRows
                    CALL      MixColumns
                    CALL      NextRoundKey        // key = k[ round ]
                    CALL      XorRoundKey         // state ^= k[ round ]
                    SUB       s3, 1
                    JUMP      NZ, RoundLoop       // next full round

                    CALL      SubBytes            // last round: same, but without MixColumns
                    CALL      ShiftRows
                    CALL      NextRoundKey        // key = k[ Nn ]
                    CALL      XorRoundKey         // state ^= k[ Nn ]
                    CALL      StateToOut          // result = state
                    RET

// result should be: (Gladman or fips197)
// R[10].k_sch d014f9a8c9ee2589e13f0cc8b6630ca6
// R[10].result  3925841d02dc09fbdc118597196a0b32

// XorRoundKey: state ^= key, all 16 bytes, in place (the AES AddRoundKey step)
//   uses: s0 (byte count), s4, s5, pKey, pState
// xor128 XORs the 16 bytes at pKey into the 16 bytes at pState; it looks usable
// as an entry of its own once both pointers are set, although nothing in the
// code shown here enters there.
XorRoundKey:
                    MOVE      pState, state       // pState -> first state byte
                    MOVE      pKey, key           // pKey -> first round key byte

xor128:             MOVE      s0, b128            // 16 bytes to go
xorbyte:            LD        s5, pState          // s5 = state byte
                    LD        s4, pKey            // s4 = key byte
                    XOR       s4, s5
                    ST        s4, pState          // state byte = state byte ^ key byte
                    ADD       pState, 1
                    ADD       pKey, 1
                    SUB       s0, 1               // flags of this SUB decide the loop
                    JUMP      NZ, xorbyte         // until 16 bytes (128/8) are done
                    RET

// InToState:    state = plain  (16 bytes)
// InkeyToKey:   key   = inkey  (16 bytes)
// ToScratch128: copy 16 bytes from the address in pKey to the address in pState
//   uses: s0 (byte count), s4 (byte in transit); both pointers end up 16 higher
// Whatever their names suggest, in these routines pKey is just the source
// pointer and pState the destination pointer.
InToState:
                    MOVE      pState, state       // destination: state buffer
                    MOVE      pKey, plain         // source: plaintext
                    JUMP      ToScratch128

InkeyToKey:
                    MOVE      pState, key         // destination: working round key
                    MOVE      pKey, inkey         // source: supplied cipher key
                                                  // and fall through into the copy loop
ToScratch128:       MOVE      s0, b128            // 16 bytes to go
copybyte:           LD        s4, pKey            // fetch source byte
                    ST        s4, pState          // store it at the destination
                    ADD       pState, 1
                    ADD       pKey, 1
                    SUB       s0, 1               // flags of this SUB decide the loop
                    JUMP      NZ, copybyte        // until 16 bytes (128/8) are done
                    RET

// StateToOut: result = state (16 bytes)
//   uses: s0 (byte count), s4 (byte in transit), pKey (source), pState (destination)
// The copy itself is done by the 16-byte copy loop at ToScratch128, the same
// loop InToState and InkeyToKey use, instead of a private duplicate of it.
// ToScratch128 ends in RET, which returns to the caller of StateToOut.
StateToOut:
                    MOVE      pKey, state         // source: final state
                    MOVE      pState, result      // destination: result buffer
                    JUMP      ToScratch128        // copy 16 bytes and return

// NextRoundKey: replace the round key in {key} by the next round key, in place
//   in/out: sF = round constant Rcon for this key; advanced to the next one
//   uses:   s0 (byte count), s4..s7 (temp word), s8 (SBox argument), pKey
// With w0..w3 being the four 32b words of {key}:
//   temp = SubWord( RotWord( w3 ) ) ^ Rcon       (Rcon only affects the first byte)
//   w0 ^= temp, w1 ^= w0, w2 ^= w1, w3 ^= w2
NextRoundKey:
// temp = RotWord( k[i - 1] )
// the last word of the previous key is fetched already rotated by one byte,
// so no register shuffling is needed afterwards
                    LD        s4, key + 13
                    LD        s5, key + 14
                    LD        s6, key + 15
                    LD        s7, key + 12

// temp = SubWord( temp ), one byte at a time through s8
                    MOVE      s8, s4
                    CALL      SBox
                    MOVE      s4, s8

                    MOVE      s8, s5
                    CALL      SBox
                    MOVE      s5, s8

                    MOVE      s8, s6
                    CALL      SBox
                    MOVE      s6, s8

                    MOVE      s8, s7
                    CALL      SBox
                    MOVE      s7, s8

// temp ^= Rcon( i / Nk ), then step Rcon: multiply sF by x modulo the field polynomial
                    XOR       s4, sF
                    SL0       sF                  // x^(i-1) (i+=1), carry = bit shifted out
                    JUMP      NC, rcon_done
                    XOR       sF, G               // overflowed: reduce
rcon_done:
// k[i] = k[i - Nk] ^ temp, for all four words of the key
// s4..s7 hold the word just written, which is temp for the next word
                    MOVE      pKey, key
                    MOVE      s0, b128            // 16 bytes, 4 per pass
nextword:           LD        s8, pKey            // byte 0 of the word
                    XOR       s4, s8
                    ST        s4, pKey
                    ADD       pKey, 1

                    LD        s8, pKey            // byte 1
                    XOR       s5, s8
                    ST        s5, pKey
                    ADD       pKey, 1

                    LD        s8, pKey            // byte 2
                    XOR       s6, s8
                    ST        s6, pKey
                    ADD       pKey, 1

                    LD        s8, pKey            // byte 3
                    XOR       s7, s8
                    ST        s7, pKey
                    ADD       pKey, 1

                    SUB       s0, 4
                    JUMP      NZ, nextword
                    RET

// SubWord: substitute, in place, the 4 bytes of the 32b word pKey points at
//   in:   pKey = address of the first byte of the word
//   out:  pKey = that address + 4
//   uses: s0 (byte count), s8 (SBox argument and result)
SubWord:
                    MOVE      s0, 4               // 4 bytes in a word
subword_next:       LD        s8, pKey            // fetch byte
                    CALL      SBox                // s8 = SBox( s8 )
                    ST        s8, pKey            // put it back
                    ADD       pKey, 1
                    SUB       s0, 1
                    JUMP      NZ, subword_next    // until the whole word is done
                    RET

// SubBytes: substitute, in place, each of the 16 bytes of {state}
//   uses: s0 (byte count), s8 (SBox argument and result), pState
SubBytes:
                    MOVE      pState, state       // pState -> first state byte

                    MOVE      s0, b128            // 16 bytes to go
sub_next:           LD        s8, pState          // fetch state byte
                    CALL      SBox                // s8 = SBox( s8 )
                    ST        s8, pState          // put it back
                    ADD       pState, 1
                    SUB       s0, 1
                    JUMP      NZ, sub_next        // until 16 bytes (128/8) are done
                    RET

// SBox( s ): substitute one byte using the look-up ROM at port SBOX_ROM
//   in:   s8 = byte to substitute
//   out:  s8 = substituted byte, as read back from the ROM
// No other register is used.
SBox:
                    OUT       s8, SBOX_ROM        // set index: write the byte to the ROM port
                    IN        s8, SBOX_ROM        // get data: read the result from the same port
                    RET

// SBox_Soft: soft version of SBOX, very slow (the inverse is found by trial and error)
//   in:   s8 = byte to substitute
//   out:  s8 = sbox_affine( mul_inverse( s8 ) )
//   uses: s9, and sC, sD, sE through MulInverse / GMul
// SBoxAffine: the affine step on its own
//   in:   s9
//   out:  s8 = s9 ^ rol(s9,1) ^ rol(s9,2) ^ rol(s9,3) ^ rol(s9,4) ^ 0x63
//         s9 is left rotated by 4 bits
SBox_Soft:
                    CALL      MulInverse          // s9 = mul_inverse( s8 )
SBoxAffine:
                    MOVE      s8, s9              // start with the value itself
                    XOR       s8, 0x63            // ... and the constant
                    RL        s9                  // then fold in the 4 left rotations
                    XOR       s8, s9              // rotated by 1
                    RL        s9
                    XOR       s8, s9              // rotated by 2
                    RL        s9
                    XOR       s8, s9              // rotated by 3
                    RL        s9
                    XOR       s8, s9              // rotated by 4
                    RET                           // result in s8

// MulInverse: multiplicative inverse by trial and error
//   in:   s8 = value to invert
//   out:  s9 = first candidate in 1 .. 255 for which GMul( s8, s9 ) is 1;
//              0 when s8 is 0 or when no candidate fits
//   uses: sC, sD, sE (arguments and result of GMul)
MulInverse:
                    MOVE      s9, 0               // result = 0
                    OR        s8, s8              // only to set Z when s8 is 0
                    RET       Z                   // in == 0: return 0
inv_try:            ADD       s9, 1               // next candidate
                    RET       Z                   // wrapped around to 0: give up
                    MOVE      sD, s9              // GMul destroys its arguments,
                    MOVE      sC, s8              // so hand it copies
                    CALL      GMul                // sE = gmul( in, result )
                    SUB       sE, 1               // product == 1 ?
                    JUMP      NZ, inv_try         // no: try the next candidate
                    RET                           // yes: s9 is the inverse

// GMul: shift-and-add multiplication; sC is reduced with G whenever doubling
// it overflows 8 bits
//   in:   sC, sD = the two factors
//   out:  sE = product
//   sC and sD are destroyed; sD is 0 on return
// Per pass: the lowest bit of sD is shifted out; when it is 1, sC is XORed
// into sE; then sC is doubled. The routine returns as soon as sD has become 0
// and the bit shifted out was 0.
GMul:
                    MOVE      sE, 0               // product = 0
gm_bit:
                    SR0       sD                  // carry = next bit of sD, Z = nothing left
                    JUMP      NC, gm_zero         // last bit was 0
                    XOR       sE, sC              // last bit was 1: add sC
                    JUMP      gm_double

gm_zero:            RET       Z                   // sD was 0 already: done
gm_double:          SL0       sC                  // sC = sC * x
                    JUMP      NC, gm_bit
                    XOR       sC, G               // overflowed: sC ^= field
                    JUMP      gm_bit

// ShiftRows: rotate rows 1, 2 and 3 of {state} left by 1, 2 and 3 columns
//   row r occupies state + r, state + r + 4, state + r + 8 and state + r + 12
//   row 0 is left alone
//   uses: s4..s7
ShiftRows:
// row 1: each byte moves one column to the left
                    LD        s4, state + 1
                    LD        s5, state + 5
                    LD        s6, state + 9
                    LD        s7, state + 13
                    ST        s5, state + 1
                    ST        s6, state + 5
                    ST        s7, state + 9
                    ST        s4, state + 13

// row 2: each byte moves two columns, so the two pairs swap
                    LD        s4, state + 2
                    LD        s5, state + 6
                    LD        s6, state + 10
                    LD        s7, state + 14
                    ST        s6, state + 2
                    ST        s7, state + 6
                    ST        s4, state + 10
                    ST        s5, state + 14

// row 3: three columns to the left, which is one column to the right
                    LD        s4, state + 15
                    LD        s5, state + 3
                    LD        s6, state + 7
                    LD        s7, state + 11
                    ST        s4, state + 3
                    ST        s5, state + 7
                    ST        s6, state + 11
                    ST        s7, state + 15

                    RET

// MixColumns: run MixColumn over each of the 4 columns of {state}, in place
//   a column is 4 consecutive bytes of {state}; it is passed to and returned
//   from MixColumn in s4..s7
//   uses: s4..s7, and s8, s9, sA, sB inside MixColumn
MixColumns:

// column 0: state + 0 .. state + 3
                    LD        s4, state + 0
                    LD        s5, state + 1
                    LD        s6, state + 2
                    LD        s7, state + 3
                    CALL      MixColumn
                    ST        s4, state + 0
                    ST        s5, state + 1
                    ST        s6, state + 2
                    ST        s7, state + 3

// column 1: state + 4 .. state + 7
                    LD        s4, state + 4
                    LD        s5, state + 5
                    LD        s6, state + 6
                    LD        s7, state + 7
                    CALL      MixColumn
                    ST        s4, state + 4
                    ST        s5, state + 5
                    ST        s6, state + 6
                    ST        s7, state + 7

// column 2: state + 8 .. state + 11
                    LD        s4, state + 8
                    LD        s5, state + 9
                    LD        s6, state + 10
                    LD        s7, state + 11
                    CALL      MixColumn
                    ST        s4, state + 8
                    ST        s5, state + 9
                    ST        s6, state + 10
                    ST        s7, state + 11

// column 3: state + 12 .. state + 15
                    LD        s4, state + 12
                    LD        s5, state + 13
                    LD        s6, state + 14
                    LD        s7, state + 15
                    CALL      MixColumn
                    ST        s4, state + 12
                    ST        s5, state + 13
                    ST        s6, state + 14
                    ST        s7, state + 15

                    RET

// MixColumn: mix one column
//   in/out: s4..s7 = c[0]..c[3]
//   uses:   s8 (scratch), s9 = t, sA = u, sB = v
// FFmul(0x02, a) is done as: shift a left by one bit and, when a bit drops
// out at the top, XOR with G.
//   t = c[0] ^ c[3], u = c[1] ^ c[2], v = t ^ u
//   c[0] ^= v ^ FFmul(0x02, c[0] ^ c[1])
//   c[1] ^= v ^ FFmul(0x02, u)
//   c[2] ^= v ^ FFmul(0x02, c[2] ^ c[3])
//   c[3] ^= v ^ FFmul(0x02, t)
// c[0] and c[2] are done first, because they still need the old c[1] and c[3].
MixColumn:
                    MOVE      sA, s5              // u = c[1] ^ c[2]
                    XOR       sA, s6
                    MOVE      s9, s4              // t = c[0] ^ c[3]
                    XOR       s9, s7
                    MOVE      sB, sA              // v = u ^ t
                    XOR       sB, s9

                    MOVE      s8, s5              // c[0]: FFmul(0x02, c[1] ^ c[0])
                    XOR       s8, s4
                    SL0       s8
                    JUMP      NC, mc0_done
                    XOR       s8, G
mc0_done:           XOR       s8, sB              // ... ^ v
                    XOR       s4, s8

                    MOVE      s8, s7              // c[2]: FFmul(0x02, c[3] ^ c[2])
                    XOR       s8, s6
                    SL0       s8
                    JUMP      NC, mc2_done
                    XOR       s8, G
mc2_done:           XOR       s8, sB              // ... ^ v
                    XOR       s6, s8

                    MOVE      s8, sA              // c[1]: FFmul(0x02, u)
                    SL0       s8
                    JUMP      NC, mc1_done
                    XOR       s8, G
mc1_done:           XOR       s8, sB              // ... ^ v
                    XOR       s5, s8

                    MOVE      s8, s9              // c[3]: FFmul(0x02, t)
                    SL0       s8
                    JUMP      NC, mc3_done
                    XOR       s8, G
mc3_done:           XOR       s8, sB              // ... ^ v
                    XOR       s7, s8

                    RET