float16/types.go at main · zerfoo/float16 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
package float16

import (
	"fmt"
)

// ErrorCode identifies the category of a failure. It is the type of the Code
// field carried by Float16Error and BFloat16Error.
//
// The Err* identifiers below are ErrorCode values, not error values: compare
// them against an error's Code field rather than against the error itself.
type ErrorCode int

// Error categories. Values are assigned sequentially by iota, so their order
// here defines their numeric value. The per-constant meanings below are
// inferred from the names; the code that assigns them is not in this file.
const (
	// ErrInvalidOperation is the zero value of ErrorCode.
	ErrInvalidOperation ErrorCode = iota
	// ErrNaN presumably marks a failure involving a NaN value.
	ErrNaN
	// ErrInfinity presumably marks a failure involving an infinite value.
	ErrInfinity
	// ErrOverflow presumably marks a result too large to represent.
	ErrOverflow
	// ErrUnderflow presumably marks a result too small to represent.
	ErrUnderflow
	// ErrDivisionByZero presumably marks a division by zero.
	ErrDivisionByZero
	// ErrNotImplemented presumably marks an unsupported operation.
	ErrNotImplemented
)

// Float16Error provides detailed error information for float16 operations.
// *Float16Error implements the error interface.
type Float16Error struct {
	// Op names the operation that failed. When non-empty it is included in
	// the message produced by Error.
	Op   string
	// Msg is the human-readable description of the failure.
	Msg  string
	// Code is the category of the failure.
	Code ErrorCode
}

// Error implements the error interface. The message is "float16 <Op>: <Msg>"
// when Op is set and "float16: <Msg>" otherwise. A nil receiver yields "<nil>"
// instead of panicking.
func (e *Float16Error) Error() string {
	switch {
	case e == nil:
		return "<nil>"
	case e.Op == "":
		return "float16: " + e.Msg
	default:
		return fmt.Sprintf("float16 %s: %s", e.Op, e.Msg)
	}
}

// BFloat16Error provides detailed error information for bfloat16 operations.
// *BFloat16Error implements the error interface.
type BFloat16Error struct {
	// Op names the operation that failed. When non-empty it is included in
	// the message produced by Error.
	Op   string
	// Msg is the human-readable description of the failure.
	Msg  string
	// Code is the category of the failure.
	Code ErrorCode
}

// Error implements the error interface. The message is "bfloat16 <Op>: <Msg>"
// when Op is set and "bfloat16: <Msg>" otherwise. A nil receiver yields
// "<nil>" instead of panicking.
func (e *BFloat16Error) Error() string {
	switch {
	case e == nil:
		return "<nil>"
	case e.Op == "":
		return "bfloat16: " + e.Msg
	default:
		return fmt.Sprintf("bfloat16 %s: %s", e.Op, e.Msg)
	}
}

// RoundingMode controls how results are rounded during conversion/arithmetic.
type RoundingMode int

// Supported rounding modes. Values are assigned sequentially by iota.
const (
	// RoundNearestEven rounds to the nearest value, ties to even.
	// It is the zero value of RoundingMode.
	RoundNearestEven RoundingMode = iota
	// RoundTowardZero rounds toward zero (truncates).
	RoundTowardZero
	// RoundTowardPositive rounds toward +Inf.
	RoundTowardPositive
	// RoundTowardNegative rounds toward -Inf.
	RoundTowardNegative
	// RoundNearestAway rounds to the nearest value, ties away from zero.
	RoundNearestAway
)

// ConversionMode controls error reporting behavior for conversions.
type ConversionMode int

// Supported conversion modes. Values are assigned sequentially by iota.
const (
	// ModeIEEE performs IEEE-style conversion, saturating to Inf/0 with no
	// errors. It is the zero value of ConversionMode.
	ModeIEEE ConversionMode = iota
	// ModeStrict reports errors for NaN, Inf, overflow, and underflow.
	ModeStrict
)

// Float16 represents a 16-bit IEEE 754 half-precision floating-point value.
//
// The underlying uint16 is the raw bit pattern: the sign bit (SignMask), a
// 5-bit biased exponent (ExponentMask) and a 10-bit mantissa (MantissaMask).
// Because the type is an integer, the == operator compares bit patterns, not
// numeric values.
type Float16 uint16

// Bits exposes the raw IEEE 754 half-precision bit pattern stored in f.
func (f Float16) Bits() uint16 {
	return uint16(f)
}

// FromBits reinterprets b as an IEEE 754 half-precision bit pattern and
// returns the corresponding Float16. No numeric conversion takes place.
func FromBits(b uint16) Float16 {
	return Float16(b)
}

// Bit-layout constants for the IEEE 754 half-precision format.
//
// All of these are untyped constants, so they can be combined with Float16 or
// uint16 operands without an explicit conversion. Exponent values here are
// raw (biased) exponent-field values, not unbiased powers of two.
const (
	SignMask     = 0x8000 // 0b1000000000000000 - Sign bit mask (bit 15)
	ExponentMask = 0x7C00 // 0b0111110000000000 - Exponent bits mask (bits 14-10)
	MantissaMask = 0x03FF // 0b0000001111111111 - Mantissa bits mask (bits 9-0)
	MantissaLen  = 10     // Number of mantissa bits; also the shift that brings the exponent field down to bit 0
	ExponentLen  = 5      // Number of exponent bits

	// Exponent bias and limits for IEEE 754 half-precision
	// bias = 2^(exponent_bits-1) - 1 = 2^4 - 1 = 15
	ExponentBias = 15 // Bias for 5-bit exponent
	ExponentMax  = 31 // Maximum exponent field value (11111 binary); same value as ExponentInfinity
	ExponentMin  = 0  // Minimum exponent field value; same value as ExponentZero

	// Exponent field range used by normalized numbers
	ExponentNormalMin = 1  // Minimum normalized exponent field
	ExponentNormalMax = 30 // Maximum normalized exponent field (infinity/NaN at 31)

	// Float32 constants for conversion
	Float32ExponentBias = 127 // IEEE 754 single precision bias
	Float32ExponentLen  = 8   // Float32 exponent bits
	Float32MantissaLen  = 23  // Float32 mantissa bits

	// Special exponent field values
	ExponentZero     = 0  // Zero and subnormal numbers
	ExponentInfinity = 31 // Infinity and NaN
)

// Special values following IEEE 754 half-precision standard.
// Each constant is the raw bit pattern of the value it names.
const (
	PositiveZero     Float16 = 0x0000 // +0.0
	NegativeZero     Float16 = 0x8000 // -0.0
	PositiveInfinity Float16 = 0x7C00 // +∞
	NegativeInfinity Float16 = 0xFC00 // -∞

	// Finite values of largest magnitude
	MaxValue Float16 = 0x7BFF // Largest positive finite value (65504)
	MinValue Float16 = 0xFBFF // Most negative finite value (-65504); NOT the smallest positive value

	// Smallest normalized positive value
	SmallestNormal Float16 = 0x0400 // 2^-14 ≈ 6.103515625e-05

	// Largest subnormal value
	LargestSubnormal Float16 = 0x03FF // (1023/1024) * 2^-14 ≈ 6.097555161e-05

	// Smallest positive subnormal value
	SmallestSubnormal Float16 = 0x0001 // 2^-24 ≈ 5.960464478e-08

	// Common NaN representations (exponent field all ones, non-zero mantissa)
	QuietNaN     Float16 = 0x7E00 // Quiet NaN (most significant mantissa bit set)
	SignalingNaN Float16 = 0x7D00 // Signaling NaN (most significant mantissa bit clear)
	NegativeQNaN Float16 = 0xFE00 // Negative quiet NaN (QuietNaN with the sign bit set)
)

// IsZero reports whether f is zero. Both +0.0 and -0.0 count as zero; they
// are the only two bit patterns whose exponent and mantissa are all clear.
func (f Float16) IsZero() bool {
	return f == PositiveZero || f == NegativeZero
}

// IsInf reports whether f is an infinity, filtered by sign:
//
//   - sign > 0: true only for positive infinity
//   - sign < 0: true only for negative infinity
//   - sign == 0: true for either infinity
func (f Float16) IsInf(sign int) bool {
	switch {
	case sign > 0:
		return f == PositiveInfinity
	case sign < 0:
		return f == NegativeInfinity
	default:
		return f == PositiveInfinity || f == NegativeInfinity
	}
}

// IsNaN reports whether f is NaN (Not a Number): every exponent bit is set
// and at least one mantissa bit is set. The sign bit is ignored.
func (f Float16) IsNaN() bool {
	exponentAllOnes := f&ExponentMask == ExponentMask
	mantissaNonZero := f&MantissaMask != 0
	return exponentAllOnes && mantissaNonZero
}

// IsFinite reports whether f is neither an infinity nor NaN, i.e. whether its
// exponent field is anything other than all ones.
func (f Float16) IsFinite() bool {
	return f&ExponentMask != ExponentMask
}

// IsNormal reports whether f is a normalized number, which excludes zero,
// subnormals, infinities and NaN.
func (f Float16) IsNormal() bool {
	switch f & ExponentMask {
	case 0, ExponentMask:
		// Exponent field all zeros (zero/subnormal) or all ones (Inf/NaN).
		return false
	default:
		return true
	}
}

// IsSubnormal reports whether f is subnormal (denormalized): the exponent
// field is all zeros while the mantissa is non-zero. Zero is not subnormal.
func (f Float16) IsSubnormal() bool {
	exponentAllZeros := f&ExponentMask == 0
	mantissaNonZero := f&MantissaMask != 0
	return exponentAllZeros && mantissaNonZero
}

// FloatClass enumerates the IEEE 754 classification of a Float16 value,
// as returned by Float16.Class.
type FloatClass int

// Classification values. Values are assigned sequentially by iota.
const (
	// ClassPositiveZero is +0.0. It is the zero value of FloatClass.
	ClassPositiveZero FloatClass = iota
	// ClassNegativeZero is -0.0.
	ClassNegativeZero
	// ClassPositiveSubnormal is a positive value with a zero exponent field
	// and a non-zero mantissa.
	ClassPositiveSubnormal
	// ClassNegativeSubnormal is a negative value with a zero exponent field
	// and a non-zero mantissa.
	ClassNegativeSubnormal
	// ClassPositiveNormal is a positive normalized value.
	ClassPositiveNormal
	// ClassNegativeNormal is a negative normalized value.
	ClassNegativeNormal
	// ClassPositiveInfinity is +Inf.
	ClassPositiveInfinity
	// ClassNegativeInfinity is -Inf.
	ClassNegativeInfinity
	// ClassQuietNaN is a NaN whose most significant mantissa bit is set,
	// regardless of sign.
	ClassQuietNaN
	// ClassSignalingNaN is a NaN whose most significant mantissa bit is
	// clear, regardless of sign.
	ClassSignalingNaN
)

// Class returns the IEEE 754 classification of f.
//
// With the sign bit cleared, the bit patterns fall into contiguous ranges in
// increasing order: zero, subnormals, normals, infinity, then NaNs. The
// classification walks those ranges instead of decoding the exponent field.
// Zeros, subnormals, normals and infinities are further split by sign; NaNs
// are split into quiet and signaling by the top mantissa bit (bit 9) and
// their sign is ignored.
func (f Float16) Class() FloatClass {
	const quietBit = 1 << (MantissaLen - 1)

	raw := uint16(f)
	negative := raw&SignMask != 0
	magnitude := raw &^ SignMask

	switch {
	case magnitude == 0:
		if negative {
			return ClassNegativeZero
		}
		return ClassPositiveZero
	case magnitude <= MantissaMask:
		// Exponent field is zero and the mantissa is non-zero.
		if negative {
			return ClassNegativeSubnormal
		}
		return ClassPositiveSubnormal
	case magnitude < ExponentMask:
		if negative {
			return ClassNegativeNormal
		}
		return ClassPositiveNormal
	case magnitude == ExponentMask:
		if negative {
			return ClassNegativeInfinity
		}
		return ClassPositiveInfinity
	case magnitude&quietBit != 0:
		return ClassQuietNaN
	default:
		return ClassSignalingNaN
	}
}

// Sign returns 0 when f is zero (either +0.0 or -0.0), -1 when the sign bit
// is set, and 1 otherwise. Only zero is special-cased, so infinities and NaNs
// report the sign given by their sign bit.
func (f Float16) Sign() int {
	switch {
	case f.IsZero():
		return 0
	case f&SignMask != 0:
		return -1
	default:
		return 1
	}
}

// Signbit reports whether the sign bit of f is set. It looks only at the sign
// bit, so it is true for -0.0 and for NaNs whose sign bit is set.
func (f Float16) Signbit() bool {
	// Shifting out the 10 mantissa and 5 exponent bits leaves only bit 15.
	return f>>(MantissaLen+ExponentLen) != 0
}

// Abs returns f with its sign bit cleared. All other bits are preserved, so
// a NaN stays a NaN with the same mantissa.
func (f Float16) Abs() Float16 {
	return f &^ SignMask
}

// Neg returns f with its sign bit inverted. All other bits are preserved, so
// zeros, infinities and NaNs simply change sign.
func (f Float16) Neg() Float16 {
	const signBit = Float16(SignMask)
	return f ^ signBit
}

// CopySign returns a value made of the exponent and mantissa bits of f and
// the sign bit of s.
func (f Float16) CopySign(s Float16) Float16 {
	magnitude := f &^ SignMask
	sign := s & SignMask
	return magnitude | sign
}

// ToInt converts f to int, truncating any fractional part toward zero.
//
// Non-finite inputs are handled explicitly, because Go leaves the result of
// converting a NaN or out-of-range floating-point value to an integer
// implementation-dependent (it differs between CPU architectures):
//
//   - NaN converts to 0
//   - +Inf converts to the largest int
//   - -Inf converts to the smallest int
//
// Every finite Float16 lies within ±65504 and therefore always fits in an int.
func (f Float16) ToInt() int {
	const (
		maxInt = int(^uint(0) >> 1)
		minInt = -maxInt - 1
	)
	switch {
	case f.IsNaN():
		return 0
	case f.IsInf(1):
		return maxInt
	case f.IsInf(-1):
		return minInt
	}
	return int(f.ToFloat32())
}

// String formats f for display. NaNs render as "NaN" or "-NaN" and infinities
// as "+Inf" or "-Inf", according to the sign bit. Every other value is
// converted to float32 and printed with up to six significant digits ("%.6g").
func (f Float16) String() string {
	negative := f.Signbit()
	switch {
	case f.IsNaN():
		if negative {
			return "-NaN"
		}
		return "NaN"
	case f.IsInf(0):
		if negative {
			return "-Inf"
		}
		return "+Inf"
	default:
		return fmt.Sprintf("%.6g", f.ToFloat32())
	}
}

// GoString returns a Go expression that reconstructs f from its bit pattern,
// e.g. "float16.FromBits(0x3c00)". The pattern is printed as four lower-case
// hexadecimal digits. fmt uses this method for the %#v verb.
func (f Float16) GoString() string {
	bits := uint16(f)
	return fmt.Sprintf("float16.FromBits(0x%04x)", bits)
}

// ToInt32 converts f to int32, truncating any fractional part toward zero.
//
// Non-finite inputs are handled explicitly, because Go leaves the result of
// converting a NaN or out-of-range floating-point value to an integer
// implementation-dependent (it differs between CPU architectures):
//
//   - NaN converts to 0
//   - +Inf converts to the largest int32
//   - -Inf converts to the smallest int32
//
// Every finite Float16 lies within ±65504 and therefore always fits in an
// int32.
func (f Float16) ToInt32() int32 {
	const (
		maxInt32 int32 = 1<<31 - 1
		minInt32 int32 = -1 << 31
	)
	switch {
	case f.IsNaN():
		return 0
	case f.IsInf(1):
		return maxInt32
	case f.IsInf(-1):
		return minInt32
	}
	return int32(f.ToFloat32())
}

// ToInt64 converts f to int64, truncating any fractional part toward zero.
//
// Non-finite inputs are handled explicitly, because Go leaves the result of
// converting a NaN or out-of-range floating-point value to an integer
// implementation-dependent (it differs between CPU architectures):
//
//   - NaN converts to 0
//   - +Inf converts to the largest int64
//   - -Inf converts to the smallest int64
//
// Every finite Float16 lies within ±65504 and therefore always fits in an
// int64.
func (f Float16) ToInt64() int64 {
	const (
		maxInt64 int64 = 1<<63 - 1
		minInt64 int64 = -1 << 63
	)
	switch {
	case f.IsNaN():
		return 0
	case f.IsInf(1):
		return maxInt64
	case f.IsInf(-1):
		return minInt64
	}
	return int64(f.ToFloat32())
}