From b2be5cec88c08293a107375fbc3fdb2be76a53f21f5fc4ee9f45d8e6ebb976eb Mon Sep 17 00:00:00 2001 From: Lixen Wraith Date: Sat, 15 Nov 2025 13:23:18 -0500 Subject: [PATCH] v0.1.2 sanitizer redesigned with policies and rules --- builder.go | 4 +- config.go | 26 ++- constant.go | 6 +- format.go | 130 ++++++++------ format_test.go | 60 ++++--- logger.go | 3 +- sanitizer/sanitizer.go | 262 ++++++++++++++++++---------- sanitizer/sanitizer_test.go | 337 ++++++++++++++++++++---------------- storage_test.go | 6 +- 9 files changed, 496 insertions(+), 338 deletions(-) diff --git a/builder.go b/builder.go index 7321c4d..f91c93e 100644 --- a/builder.go +++ b/builder.go @@ -75,8 +75,8 @@ func (b *Builder) Format(format string) *Builder { } // Sanitization sets the sanitization mode -func (b *Builder) Sanitization(mode sanitizer.Mode) *Builder { - b.cfg.Sanitization = mode +func (b *Builder) Sanitization(policy sanitizer.PolicyPreset) *Builder { + b.cfg.Sanitization = policy return b } diff --git a/config.go b/config.go index aa0961b..3898707 100644 --- a/config.go +++ b/config.go @@ -24,11 +24,11 @@ type Config struct { Extension string `toml:"extension"` // Log file extension // Formatting - Format string `toml:"format"` // "txt", "raw", or "json" - ShowTimestamp bool `toml:"show_timestamp"` // Add timestamp to log records - ShowLevel bool `toml:"show_level"` // Add level to log record - TimestampFormat string `toml:"timestamp_format"` // Time format for log timestamps - Sanitization sanitizer.Mode `toml:"sanitization"` // 0=None, 1=HexEncode, 2=Strip, 3=Escape + Format string `toml:"format"` // "txt", "raw", or "json" + ShowTimestamp bool `toml:"show_timestamp"` // Add timestamp to log records + ShowLevel bool `toml:"show_level"` // Add level to log record + TimestampFormat string `toml:"timestamp_format"` // Time format for log timestamps + Sanitization sanitizer.PolicyPreset `toml:"sanitization"` // "raw", "json", "txt", "shell" // Buffer and size limits BufferSize int64
`toml:"buffer_size"` // Channel buffer size @@ -75,7 +75,7 @@ var defaultConfig = Config{ ShowTimestamp: true, ShowLevel: true, TimestampFormat: time.RFC3339Nano, - Sanitization: sanitizer.HexEncode, + Sanitization: sanitizer.PolicyTxt, // Buffer and size limits BufferSize: 1024, @@ -127,9 +127,11 @@ func (c *Config) Validate() error { return fmtErrorf("invalid format: '%s' (use txt, json, or raw)", c.Format) } - // TODO: better bound check, implement validator in `sanitizer` - if c.Sanitization < 0 || c.Sanitization > sanitizer.Escape { - return fmtErrorf("invalid sanitization mode: '%d' (use 0=None, 1=HexEncode, 2=Strip, 3=Escape)", c.Sanitization) + switch c.Sanitization { + case sanitizer.PolicyRaw, sanitizer.PolicyJSON, sanitizer.PolicyTxt, sanitizer.PolicyShell: + // valid policy + default: + return fmtErrorf("invalid sanitization policy: '%s' (use raw, json, txt, or shell)", c.Sanitization) } if strings.HasPrefix(c.Extension, ".") { @@ -226,11 +228,7 @@ func applyConfigField(cfg *Config, key, value string) error { case "timestamp_format": cfg.TimestampFormat = value case "sanitization": - intVal, err := strconv.ParseInt(value, 10, 64) - if err != nil { - return fmtErrorf("invalid integer value for sanitization '%s': %w", value, err) - } - cfg.Sanitization = sanitizer.Mode(intVal) + cfg.Sanitization = sanitizer.PolicyPreset(value) // Buffer and size limits case "buffer_size": diff --git a/constant.go b/constant.go index 977ebfb..bad3b2d 100644 --- a/constant.go +++ b/constant.go @@ -22,9 +22,9 @@ const ( // Record flags for controlling output structure const ( - FlagShowTimestamp int64 = 0b0001 - FlagShowLevel int64 = 0b0010 - FlagRaw int64 = 0b0100 + FlagRaw int64 = 0b0001 + FlagShowTimestamp int64 = 0b0010 + FlagShowLevel int64 = 0b0100 FlagStructuredJSON int64 = 0b1000 FlagDefault = FlagShowTimestamp | FlagShowLevel ) diff --git a/format.go b/format.go index d00c471..bf86c11 100644 --- a/format.go +++ b/format.go @@ -13,27 +13,30 @@ import ( // Formatter 
manages the buffered writing and formatting of log entries type Formatter struct { - format string - buf []byte - timestampFormat string - sanitizationMode sanitizer.Mode - sanitizer *sanitizer.Sanitizer + format string + buf []byte + timestampFormat string + sanitizer *sanitizer.Sanitizer } // NewFormatter creates a formatter instance -func NewFormatter(format string, bufferSize int64, timestampFormat string, sanitizationMode sanitizer.Mode) *Formatter { +func NewFormatter(format string, bufferSize int64, timestampFormat string, sanitizationPolicy sanitizer.PolicyPreset) *Formatter { if timestampFormat == "" { timestampFormat = time.RFC3339Nano } if format == "" { format = "txt" } + if sanitizationPolicy == "" { + sanitizationPolicy = "raw" + } + + s := (sanitizer.New()).Policy(sanitizationPolicy) return &Formatter{ - format: format, - buf: make([]byte, 0, bufferSize), - timestampFormat: timestampFormat, - sanitizationMode: sanitizationMode, - sanitizer: sanitizer.New(sanitizationMode), + format: format, + buf: make([]byte, 0, bufferSize), + timestampFormat: timestampFormat, + sanitizer: s, } } @@ -46,31 +49,45 @@ func (f *Formatter) Reset() { func (f *Formatter) Format(format string, flags int64, timestamp time.Time, level int64, trace string, args []any) []byte { f.Reset() - // The FlagRaw acts as an override to the configured format - effectiveFormat := format + // FlagRaw completely bypasses formatting and sanitization if flags&FlagRaw != 0 { - effectiveFormat = "raw" + for i, arg := range args { + if i > 0 { + f.buf = append(f.buf, ' ') + } + // Direct conversion without sanitization + switch v := arg.(type) { + case string: + f.buf = append(f.buf, v...) + case []byte: + f.buf = append(f.buf, v...) + case fmt.Stringer: + f.buf = append(f.buf, v.String()...) + case error: + f.buf = append(f.buf, v.Error()...) + default: + f.buf = append(f.buf, fmt.Sprint(v)...) 
+ } + } + return f.buf } - // Create the handler based on the effective format - handler := sanitizer.NewUnifiedHandler(effectiveFormat, f.sanitizer) + // Create the serializer based on the effective format + serializer := sanitizer.NewSerializer(format, f.sanitizer) - switch effectiveFormat { + switch format { case "raw": - // This dedicated path handles both format="raw" and FlagRaw - // It only serializes the arguments and adds NO metadata or newlines + // Raw formatting serializes the arguments and adds NO metadata or newlines for i, arg := range args { - f.convertValue(&f.buf, arg, handler, i > 0) + f.convertValue(&f.buf, arg, serializer, i > 0) } return f.buf case "json": - // The existing JSON serialization logic remains unchanged - return f.formatJSON(flags, timestamp, level, trace, args, handler) + return f.formatJSON(flags, timestamp, level, trace, args, serializer) case "txt": - // The existing Txt serialization logic is now correctly isolated - return f.formatTxt(flags, timestamp, level, trace, args, handler) + return f.formatTxt(flags, timestamp, level, trace, args, serializer) } return nil // forcing panic on unrecognized format @@ -79,86 +96,86 @@ func (f *Formatter) Format(format string, flags int64, timestamp time.Time, leve // FormatValue formats a single value according to the formatter's configuration func (f *Formatter) FormatValue(v any) []byte { f.Reset() - handler := sanitizer.NewUnifiedHandler(f.format, f.sanitizer) - f.convertValue(&f.buf, v, handler, false) + serializer := sanitizer.NewSerializer(f.format, f.sanitizer) + f.convertValue(&f.buf, v, serializer, false) return f.buf } // FormatArgs formats multiple arguments as space-separated values func (f *Formatter) FormatArgs(args ...any) []byte { f.Reset() - handler := sanitizer.NewUnifiedHandler(f.format, f.sanitizer) + serializer := sanitizer.NewSerializer(f.format, f.sanitizer) for i, arg := range args { - f.convertValue(&f.buf, arg, handler, i > 0) + f.convertValue(&f.buf, arg, 
serializer, i > 0) } return f.buf } // convertValue provides unified type conversion -func (f *Formatter) convertValue(buf *[]byte, v any, handler *sanitizer.UnifiedHandler, needsSpace bool) { +func (f *Formatter) convertValue(buf *[]byte, v any, serializer *sanitizer.Serializer, needsSpace bool) { if needsSpace && len(*buf) > 0 { *buf = append(*buf, ' ') } switch val := v.(type) { case string: - handler.WriteString(buf, val) + serializer.WriteString(buf, val) case []byte: - handler.WriteString(buf, string(val)) + serializer.WriteString(buf, string(val)) case rune: var runeStr [utf8.UTFMax]byte n := utf8.EncodeRune(runeStr[:], val) - handler.WriteString(buf, string(runeStr[:n])) + serializer.WriteString(buf, string(runeStr[:n])) case int: num := strconv.AppendInt(nil, int64(val), 10) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case int64: num := strconv.AppendInt(nil, val, 10) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case uint: num := strconv.AppendUint(nil, uint64(val), 10) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case uint64: num := strconv.AppendUint(nil, val, 10) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case float32: num := strconv.AppendFloat(nil, float64(val), 'f', -1, 32) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case float64: num := strconv.AppendFloat(nil, val, 'f', -1, 64) - handler.WriteNumber(buf, string(num)) + serializer.WriteNumber(buf, string(num)) case bool: - handler.WriteBool(buf, val) + serializer.WriteBool(buf, val) case nil: - handler.WriteNil(buf) + serializer.WriteNil(buf) case time.Time: timeStr := val.Format(f.timestampFormat) - handler.WriteString(buf, timeStr) + serializer.WriteString(buf, timeStr) case error: - handler.WriteString(buf, val.Error()) + serializer.WriteString(buf, val.Error()) case fmt.Stringer: - 
handler.WriteString(buf, val.String()) + serializer.WriteString(buf, val.String()) default: - handler.WriteComplex(buf, val) + serializer.WriteComplex(buf, val) } } // formatJSON unifies JSON output -func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, trace string, args []any, handler *sanitizer.UnifiedHandler) []byte { +func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, trace string, args []any, serializer *sanitizer.Serializer) []byte { f.buf = append(f.buf, '{') needsComma := false @@ -184,7 +201,7 @@ func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, tr f.buf = append(f.buf, ',') } f.buf = append(f.buf, `"trace":`...) - handler.WriteString(&f.buf, trace) + serializer.WriteString(&f.buf, trace) needsComma = true } @@ -196,7 +213,7 @@ func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, tr f.buf = append(f.buf, ',') } f.buf = append(f.buf, `"message":`...) - handler.WriteString(&f.buf, message) + serializer.WriteString(&f.buf, message) f.buf = append(f.buf, ',') f.buf = append(f.buf, `"fields":`...) @@ -204,7 +221,7 @@ func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, tr marshaledFields, err := json.Marshal(fields) if err != nil { f.buf = append(f.buf, `{"_marshal_error":"`...) - handler.WriteString(&f.buf, err.Error()) + serializer.WriteString(&f.buf, err.Error()) f.buf = append(f.buf, `"}`...) } else { f.buf = append(f.buf, marshaledFields...) 
@@ -226,7 +243,7 @@ func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, tr if i > 0 { f.buf = append(f.buf, ',') } - f.convertValue(&f.buf, arg, handler, false) + f.convertValue(&f.buf, arg, serializer, false) } f.buf = append(f.buf, ']') } @@ -236,7 +253,7 @@ func (f *Formatter) formatJSON(flags int64, timestamp time.Time, level int64, tr } // formatTxt handles txt format output -func (f *Formatter) formatTxt(flags int64, timestamp time.Time, level int64, trace string, args []any, handler *sanitizer.UnifiedHandler) []byte { +func (f *Formatter) formatTxt(flags int64, timestamp time.Time, level int64, trace string, args []any, serializer *sanitizer.Serializer) []byte { needsSpace := false if flags&FlagShowTimestamp != 0 { @@ -256,12 +273,21 @@ func (f *Formatter) formatTxt(flags int64, timestamp time.Time, level int64, tra if needsSpace { f.buf = append(f.buf, ' ') } - f.buf = append(f.buf, trace...) + // Sanitize trace to prevent terminal control sequence injection + traceHandler := sanitizer.NewSerializer("txt", f.sanitizer) + tempBuf := make([]byte, 0, len(trace)*2) + traceHandler.WriteString(&tempBuf, trace) + // Extract content without quotes if added by txt serializer + if len(tempBuf) > 2 && tempBuf[0] == '"' && tempBuf[len(tempBuf)-1] == '"' { + f.buf = append(f.buf, tempBuf[1:len(tempBuf)-1]...) + } else { + f.buf = append(f.buf, tempBuf...) 
+ } needsSpace = true } for _, arg := range args { - f.convertValue(&f.buf, arg, handler, needsSpace) + f.convertValue(&f.buf, arg, serializer, needsSpace) needsSpace = true } diff --git a/format_test.go b/format_test.go index e499d40..9c17e46 100644 --- a/format_test.go +++ b/format_test.go @@ -10,13 +10,14 @@ import ( "testing" "time" + "github.com/lixenwraith/log/sanitizer" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) // TestFormatter tests the output of the formatter for txt, json, and raw formats func TestFormatter(t *testing.T) { - f := NewFormatter("txt", 1024, time.RFC3339Nano, 0) + f := NewFormatter("txt", 1024, time.RFC3339Nano, sanitizer.PolicyRaw) timestamp := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC) t.Run("txt format", func(t *testing.T) { @@ -30,7 +31,7 @@ func TestFormatter(t *testing.T) { assert.True(t, strings.HasSuffix(str, "\n")) }) - f = NewFormatter("json", 1024, time.RFC3339Nano, 0) + f = NewFormatter("json", 1024, time.RFC3339Nano, sanitizer.PolicyRaw) t.Run("json format", func(t *testing.T) { data := f.Format("json", FlagDefault, timestamp, LevelWarn, "trace1", []any{"warning", true}) @@ -45,7 +46,7 @@ func TestFormatter(t *testing.T) { assert.Equal(t, true, fields[1]) }) - f = NewFormatter("raw", 1024, time.RFC3339Nano, 0) + f = NewFormatter("raw", 1024, time.RFC3339Nano, sanitizer.PolicyRaw) t.Run("raw format", func(t *testing.T) { data := f.Format("raw", 0, timestamp, LevelInfo, "", []any{"raw", "data", 42}) str := string(data) @@ -61,7 +62,7 @@ func TestFormatter(t *testing.T) { assert.Equal(t, "forced raw", str) }) - f = NewFormatter("json", 1024, time.RFC3339Nano, 0) + f = NewFormatter("json", 1024, time.RFC3339Nano, sanitizer.PolicyJSON) t.Run("structured json", func(t *testing.T) { fields := map[string]any{"key1": "value1", "key2": 42} data := f.Format("json", FlagStructuredJSON|FlagDefault, timestamp, LevelInfo, "", @@ -75,7 +76,7 @@ func TestFormatter(t *testing.T) { assert.Equal(t, 
map[string]any{"key1": "value1", "key2": float64(42)}, result["fields"]) }) - f = NewFormatter("json", 1024, time.RFC3339Nano, 3) + f = NewFormatter("json", 1024, time.RFC3339Nano, sanitizer.PolicyJSON) t.Run("special characters escaping", func(t *testing.T) { data := f.Format("json", FlagDefault, timestamp, LevelInfo, "", []any{"test\n\r\t\"\\message"}) @@ -121,33 +122,42 @@ func TestControlCharacterWrite(t *testing.T) { logger, tmpDir := createTestLogger(t) defer logger.Shutdown() - // Test various control characters + cfg := logger.GetConfig() + cfg.Format = "raw" + cfg.ShowTimestamp = false + cfg.ShowLevel = false + err := logger.ApplyConfig(cfg) + require.NoError(t, err) + + // Test various control characters with expected sanitized output testCases := []struct { - name string - input string + name string + input string + expected string }{ - {"null bytes", "test\x00data"}, - {"bell", "alert\x07message"}, - {"backspace", "back\x08space"}, - {"form feed", "page\x0Cbreak"}, - {"vertical tab", "vertical\x0Btab"}, - {"escape", "escape\x1B[31mcolor"}, - {"mixed", "\x00\x01\x02test\x1F\x7Fdata"}, + {"null bytes", "test\x00data", "test<00>data"}, + {"bell", "alert\x07message", "alert<07>message"}, + {"backspace", "back\x08space", "back<08>space"}, + {"form feed", "page\x0Cbreak", "page<0c>break"}, + {"vertical tab", "vertical\x0Btab", "vertical<0b>tab"}, + {"escape", "escape\x1B[31mcolor", "escape<1b>[31mcolor"}, + {"mixed", "\x00\x01\x02test\x1F\x7Fdata", "<00><01><02>test<1f><7f>data"}, } for _, tc := range testCases { - logger.Write(tc.input) + logger.Message(tc.input) } logger.Flush(time.Second) - // Verify file contains hex-encoded control chars content, err := os.ReadFile(filepath.Join(tmpDir, "log.log")) require.NoError(t, err) - // Control chars should be hex-encoded in raw output - assert.Contains(t, string(content), "test") - assert.Contains(t, string(content), "data") + // Verify each test case produced correct sanitized output + for _, tc := range 
testCases { + assert.Contains(t, string(content), tc.expected, + "Test case '%s' should produce hex-encoded control chars", tc.name) + } } // TestRawSanitizedOutput verifies that raw output is correctly sanitized @@ -155,6 +165,14 @@ func TestRawSanitizedOutput(t *testing.T) { logger, tmpDir := createTestLogger(t) defer logger.Shutdown() + // Use raw format instead of Write() to test sanitization + cfg := logger.GetConfig() + cfg.Format = "raw" + cfg.ShowTimestamp = false + cfg.ShowLevel = false + err := logger.ApplyConfig(cfg) + require.NoError(t, err) + // 1. A string with valid multi-byte UTF-8 should be unchanged utf8String := "Hello │ 世界" @@ -171,7 +189,7 @@ func TestRawSanitizedOutput(t *testing.T) { expectedMultiByteOutput := "line1<c285>line2" // Log all cases - logger.Write(utf8String, stringWithControl, bytesWithControl, multiByteControl) + logger.Message(utf8String, stringWithControl, bytesWithControl, multiByteControl) logger.Flush(time.Second) // Read and verify the single line of output diff --git a/logger.go b/logger.go index 3de74ca..c85f373 100644 --- a/logger.go +++ b/logger.go @@ -326,8 +326,7 @@ func (l *Logger) LogStructured(level int64, message string, fields map[string]an l.log(l.getFlags()|FlagStructuredJSON, level, 0, []any{message, fields}) } -// Write outputs raw, unformatted data regardless of configured format -// Writes args as space-separated strings without a trailing newline +// Write outputs raw, unformatted data, bypassing the configured format and sanitization; no trailing newline is appended func (l *Logger) Write(args ...any) { l.log(FlagRaw, LevelInfo, 0, args...) } diff --git a/sanitizer/sanitizer.go b/sanitizer/sanitizer.go index df47316..5f02482 100644 --- a/sanitizer/sanitizer.go +++ b/sanitizer/sanitizer.go @@ -1,4 +1,6 @@ // FILE: lixenwraith/log/sanitizer/sanitizer.go +// Package sanitizer provides a fluent and composable interface for sanitizing +// strings based on configurable rules using bitwise filter flags and transforms.
package sanitizer import ( @@ -12,105 +14,186 @@ import ( "github.com/davecgh/go-spew/spew" ) -// Mode controls how non-printable characters are handled -type Mode int - -// Sanitization modes +// Filter flags for character matching const ( - None Mode = iota // No sanitization - HexEncode // Encode as <XX> (current default) - Strip // Remove control characters - Escape // JSON-style escaping + FilterNonPrintable uint64 = 1 << iota // Matches runes not classified as printable by strconv.IsPrint + FilterControl // Matches control characters (unicode.IsControl) + FilterWhitespace // Matches whitespace characters (unicode.IsSpace) + FilterShellSpecial // Matches common shell metacharacters: '`', '$', ';', '|', '&', '>', '<', '(', ')', '#' ) -// Sanitizer provides centralized sanitization logic -type Sanitizer struct { - mode Mode - buf []byte // Reusable buffer +// Transform flags for character transformation +const ( + TransformStrip uint64 = 1 << iota // Removes the character + TransformHexEncode // Encodes the character's UTF-8 bytes as "<XX>" + TransformJSONEscape // Escapes the character with JSON-style backslashes (e.g., '\n', '\u0000') +) + +// PolicyPreset defines pre-configured sanitization policies +type PolicyPreset string + +const ( + PolicyRaw PolicyPreset = "raw" // Default is a no-op (passthrough) + PolicyJSON PolicyPreset = "json" // Policy for sanitizing strings to be embedded in JSON + PolicyTxt PolicyPreset = "txt" // Policy for sanitizing text written to log files + PolicyShell PolicyPreset = "shell" // Policy for sanitizing arguments passed to shell commands +) + +// rule represents a single sanitization rule +type rule struct { + filter uint64 + transform uint64 } -func New(mode Mode) *Sanitizer { - return &Sanitizer{ - mode: mode, - buf: make([]byte, 0, 256), - } +// policyRules contains pre-configured rules for each policy +var policyRules = map[PolicyPreset][]rule{ + PolicyRaw: {}, + PolicyTxt: {{filter: FilterNonPrintable, transform:
TransformHexEncode}}, + PolicyJSON: {{filter: FilterControl, transform: TransformJSONEscape}}, + PolicyShell: {{filter: FilterShellSpecial | FilterWhitespace, transform: TransformStrip}}, } -func (s *Sanitizer) Reset() { - s.buf = s.buf[:0] -} - -func (s *Sanitizer) Sanitize(data string) string { - if s.mode == None { - return data - } - - s.Reset() - - for _, r := range data { - if strconv.IsPrint(r) { - s.buf = utf8.AppendRune(s.buf, r) - continue +// filterCheckers maps individual filter flags to their check functions +var filterCheckers = map[uint64]func(rune) bool{ + FilterNonPrintable: func(r rune) bool { return !strconv.IsPrint(r) }, + FilterControl: unicode.IsControl, + FilterWhitespace: unicode.IsSpace, + FilterShellSpecial: func(r rune) bool { + switch r { + case '`', '$', ';', '|', '&', '>', '<', '(', ')', '#': + return true } + return false + }, +} - switch s.mode { - case HexEncode: - var runeBytes [utf8.UTFMax]byte - n := utf8.EncodeRune(runeBytes[:], r) - s.buf = append(s.buf, '<') - s.buf = append(s.buf, hex.EncodeToString(runeBytes[:n])...) - s.buf = append(s.buf, '>') +// Sanitizer provides chainable text sanitization +type Sanitizer struct { + rules []rule + buf []byte +} - case Strip: - // Skip non-printable - continue +// New creates a new Sanitizer instance +func New() *Sanitizer { + return &Sanitizer{ + rules: []rule{}, + buf: make([]byte, 0, 256), + } +} - case Escape: - switch r { - case '\n': - s.buf = append(s.buf, '\\', 'n') - case '\r': - s.buf = append(s.buf, '\\', 'r') - case '\t': - s.buf = append(s.buf, '\\', 't') - case '\b': - s.buf = append(s.buf, '\\', 'b') - case '\f': - s.buf = append(s.buf, '\\', 'f') - default: - // Unicode escape for other control chars - s.buf = append(s.buf, '\\', 'u') - s.buf = append(s.buf, fmt.Sprintf("%04x", r)...) 
+// Rule adds a custom rule to the sanitizer (appended; rules are checked in insertion order and the first match wins) +func (s *Sanitizer) Rule(filter uint64, transform uint64) *Sanitizer { + // Append rule in natural order + s.rules = append(s.rules, rule{filter: filter, transform: transform}) + return s +} + +// Policy applies a pre-configured policy to the sanitizer (appended) +func (s *Sanitizer) Policy(preset PolicyPreset) *Sanitizer { + if rules, ok := policyRules[preset]; ok { + s.rules = append(s.rules, rules...) + } + return s +} + +// Sanitize applies all configured rules to the input string +func (s *Sanitizer) Sanitize(data string) string { + // Reset buffer + s.buf = s.buf[:0] + + // Process each rune + for _, r := range data { + matched := false + // Check rules in order (first match wins) + for _, rl := range s.rules { + if matchesFilter(r, rl.filter) { + applyTransform(&s.buf, r, rl.transform) + matched = true + break } } + // If no rule matched, append original rune + if !matched { + s.buf = utf8.AppendRune(s.buf, r) + } } return string(s.buf) } -// UnifiedHandler implements all format behaviors in a single struct -type UnifiedHandler struct { +// matchesFilter checks if a rune matches any filter in the mask +func matchesFilter(r rune, filterMask uint64) bool { + for flag, checker := range filterCheckers { + if (filterMask&flag) != 0 && checker(r) { + return true + } + } + return false +} + +// applyTransform applies the specified transform to the buffer +func applyTransform(buf *[]byte, r rune, transformMask uint64) { + switch { + case (transformMask & TransformStrip) != 0: + // Do nothing (strip) + + case (transformMask & TransformHexEncode) != 0: + var runeBytes [utf8.UTFMax]byte + n := utf8.EncodeRune(runeBytes[:], r) + *buf = append(*buf, '<') + *buf = append(*buf, hex.EncodeToString(runeBytes[:n])...)
+ *buf = append(*buf, '>') + + case (transformMask & TransformJSONEscape) != 0: + switch r { + case '\n': + *buf = append(*buf, '\\', 'n') + case '\r': + *buf = append(*buf, '\\', 'r') + case '\t': + *buf = append(*buf, '\\', 't') + case '\b': + *buf = append(*buf, '\\', 'b') + case '\f': + *buf = append(*buf, '\\', 'f') + case '"': + *buf = append(*buf, '\\', '"') + case '\\': + *buf = append(*buf, '\\', '\\') + default: + if r < 0x20 || r == 0x7f { + *buf = append(*buf, fmt.Sprintf("\\u%04x", r)...) + } else { + *buf = utf8.AppendRune(*buf, r) + } + } + } +} + +// Serializer implements format-specific output behaviors +type Serializer struct { format string sanitizer *Sanitizer } -func NewUnifiedHandler(format string, san *Sanitizer) *UnifiedHandler { - return &UnifiedHandler{ +// NewSerializer creates a handler with format-specific behavior +func NewSerializer(format string, san *Sanitizer) *Serializer { + return &Serializer{ format: format, sanitizer: san, } } -func (h *UnifiedHandler) WriteString(buf *[]byte, s string) { - switch h.format { +// WriteString writes a string with format-specific handling +func (se *Serializer) WriteString(buf *[]byte, s string) { + switch se.format { case "raw": - *buf = append(*buf, h.sanitizer.Sanitize(s)...) + *buf = append(*buf, se.sanitizer.Sanitize(s)...) 
case "txt": - sanitized := h.sanitizer.Sanitize(s) - if h.NeedsQuotes(sanitized) { + sanitized := se.sanitizer.Sanitize(s) + if se.NeedsQuotes(sanitized) { *buf = append(*buf, '"') - // Escape quotes within quoted strings for i := 0; i < len(sanitized); i++ { if sanitized[i] == '"' || sanitized[i] == '\\' { *buf = append(*buf, '\\') @@ -124,12 +207,12 @@ func (h *UnifiedHandler) WriteString(buf *[]byte, s string) { case "json": *buf = append(*buf, '"') - // Direct JSON escaping without pre-sanitization + // Direct JSON escaping for i := 0; i < len(s); { c := s[i] - if c >= ' ' && c != '"' && c != '\\' { + if c >= ' ' && c != '"' && c != '\\' && c < 0x7f { start := i - for i < len(s) && s[i] >= ' ' && s[i] != '"' && s[i] != '\\' { + for i < len(s) && s[i] >= ' ' && s[i] != '"' && s[i] != '\\' && s[i] < 0x7f { i++ } *buf = append(*buf, s[start:i]...) @@ -157,27 +240,30 @@ func (h *UnifiedHandler) WriteString(buf *[]byte, s string) { } } -func (h *UnifiedHandler) WriteNumber(buf *[]byte, n string) { +// WriteNumber writes a number value +func (se *Serializer) WriteNumber(buf *[]byte, n string) { *buf = append(*buf, n...) } -func (h *UnifiedHandler) WriteBool(buf *[]byte, b bool) { +// WriteBool writes a boolean value +func (se *Serializer) WriteBool(buf *[]byte, b bool) { *buf = strconv.AppendBool(*buf, b) } -func (h *UnifiedHandler) WriteNil(buf *[]byte) { - switch h.format { +// WriteNil writes a nil value +func (se *Serializer) WriteNil(buf *[]byte) { + switch se.format { case "raw": *buf = append(*buf, "nil"...) - default: // txt, json + default: *buf = append(*buf, "null"...) 
} } -func (h *UnifiedHandler) WriteComplex(buf *[]byte, v any) { - switch h.format { +// WriteComplex writes complex types +func (se *Serializer) WriteComplex(buf *[]byte, v any) { + switch se.format { case "raw": - // Use spew for complex types in raw mode, DEBUG use var b bytes.Buffer dumper := &spew.ConfigState{ Indent: " ", @@ -189,41 +275,37 @@ func (h *UnifiedHandler) WriteComplex(buf *[]byte, v any) { dumper.Fdump(&b, v) *buf = append(*buf, bytes.TrimSpace(b.Bytes())...) - default: // txt, json + default: str := fmt.Sprintf("%+v", v) - h.WriteString(buf, str) + se.WriteString(buf, str) } } -func (h *UnifiedHandler) NeedsQuotes(s string) bool { - switch h.format { +// NeedsQuotes determines if quoting is needed +func (se *Serializer) NeedsQuotes(s string) bool { + switch se.format { case "json": - return true // JSON always quotes + return true case "txt": - // Quote strings that: - // 1. Are empty if len(s) == 0 { return true } for _, r := range s { - // 2. Contain whitespace (space, tab, newline, etc.) if unicode.IsSpace(r) { return true } - // 3. Contain shell special characters (POSIX + common extensions) switch r { case '"', '\'', '\\', '$', '`', '!', '&', '|', ';', '(', ')', '<', '>', '*', '?', '[', ']', '{', '}', '~', '#', '%', '=', '\n', '\r', '\t': return true } - // 4. 
Non-print if !unicode.IsPrint(r) { return true } } return false - default: // raw + default: return false } } \ No newline at end of file diff --git a/sanitizer/sanitizer_test.go b/sanitizer/sanitizer_test.go index 8fd857c..3168ca5 100644 --- a/sanitizer/sanitizer_test.go +++ b/sanitizer/sanitizer_test.go @@ -8,199 +8,234 @@ import ( "github.com/stretchr/testify/assert" ) -func TestSanitizer(t *testing.T) { - testCases := []struct { - name string - input string - mode Mode - expected string - }{ - // None mode tests - { - name: "none mode passes through", - input: "hello\x00world\n", - mode: None, - expected: "hello\x00world\n", - }, - - // HexEncode tests - { - name: "hex encode null byte", - input: "test\x00data", - mode: HexEncode, - expected: "test<00>data", - }, - { - name: "hex encode control chars", - input: "bell\x07tab\x09form\x0c", - mode: HexEncode, - expected: "bell<07>tab<09>form<0c>", - }, - { - name: "hex encode preserves printable", - input: "Hello World 123!@#", - mode: HexEncode, - expected: "Hello World 123!@#", - }, - { - name: "hex encode multi-byte control", - input: "line1\u0085line2", // NEXT LINE (C2 85) - mode: HexEncode, - expected: "line1<c285>line2", - }, - { - name: "hex encode preserves UTF-8", - input: "Hello 世界 ✓", - mode: HexEncode, - expected: "Hello 世界 ✓", - }, - - // Strip tests - { - name: "strip removes control chars", - input: "clean\x00\x07\ntxt", - mode: Strip, - expected: "cleantxt", - }, - { - name: "strip preserves spaces", - input: "hello world", - mode: Strip, - expected: "hello world", - }, - - // Escape tests - { - name: "escape common control chars", - input: "line1\nline2\ttab\rreturn", - mode: Escape, - expected: "line1\\nline2\\ttab\\rreturn", - }, - { - name: "escape unicode control", - input: "text\x01\x1f", - mode: Escape, - expected: "text\\u0001\\u001f", - }, - { - name: "escape backspace and form feed", - input: "back\bspace form\ffeed", - mode: Escape, - expected: "back\\bspace form\\ffeed", - }, - } - - for _,
tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s := New(tc.mode) - result := s.Sanitize(tc.input) - assert.Equal(t, tc.expected, result) - }) - } +func TestNewSanitizer(t *testing.T) { + // Default passthrough behavior + s := New() + input := "abc\x00xyz" + assert.Equal(t, input, s.Sanitize(input), "default sanitizer should pass through all characters") } -func TestUnifiedHandler(t *testing.T) { - t.Run("raw format", func(t *testing.T) { - san := New(HexEncode) - handler := NewUnifiedHandler("raw", san) - - var buf []byte - - // String handling - handler.WriteString(&buf, "test\x00data") - assert.Equal(t, "test<00>data", string(buf)) - - // Nil handling - buf = nil - handler.WriteNil(&buf) - assert.Equal(t, "nil", string(buf)) - - // No quotes needed - assert.False(t, handler.NeedsQuotes("any string")) +func TestSingleRule(t *testing.T) { + t.Run("strip non-printable", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformStrip) + assert.Equal(t, "ab", s.Sanitize("a\x00b")) + assert.Equal(t, "test", s.Sanitize("test\x01\x02\x03")) }) - t.Run("txt format", func(t *testing.T) { - san := New(HexEncode) - handler := NewUnifiedHandler("txt", san) + t.Run("hex encode non-printable", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformHexEncode) + assert.Equal(t, "a<00>b", s.Sanitize("a\x00b")) + assert.Equal(t, "bell<07>tab<09>", s.Sanitize("bell\x07tab\x09")) + }) + + t.Run("JSON escape control", func(t *testing.T) { + s := New().Rule(FilterControl, TransformJSONEscape) + assert.Equal(t, "line1\\nline2", s.Sanitize("line1\nline2")) + assert.Equal(t, "tab\\there", s.Sanitize("tab\there")) + assert.Equal(t, "null\\u0000byte", s.Sanitize("null\x00byte")) + }) + + t.Run("strip whitespace", func(t *testing.T) { + s := New().Rule(FilterWhitespace, TransformStrip) + assert.Equal(t, "nospaceshere", s.Sanitize("no spaces here")) + assert.Equal(t, "tabsgone", s.Sanitize("tabs\t\tgone")) + }) + + t.Run("strip shell special", func(t 
*testing.T) { + s := New().Rule(FilterShellSpecial, TransformStrip) + assert.Equal(t, "cmd echo test", s.Sanitize("cmd; echo test")) + assert.Equal(t, "no pipes", s.Sanitize("no | pipes")) + assert.Equal(t, "var", s.Sanitize("$var")) + }) +} + +func TestPolicy(t *testing.T) { + t.Run("PolicyTxt", func(t *testing.T) { + s := New().Policy(PolicyTxt) + assert.Equal(t, "hello<07>world", s.Sanitize("hello\x07world")) + assert.Equal(t, "clean text", s.Sanitize("clean text")) + }) + + t.Run("PolicyJSON", func(t *testing.T) { + s := New().Policy(PolicyJSON) + assert.Equal(t, "line1\\nline2", s.Sanitize("line1\nline2")) + assert.Equal(t, "\\ttab", s.Sanitize("\ttab")) + }) + + t.Run("PolicyShellArg", func(t *testing.T) { + s := New().Policy(PolicyShell) + assert.Equal(t, "cmdecho", s.Sanitize("cmd; echo")) + assert.Equal(t, "nospaces", s.Sanitize("no spaces")) + }) +} + +func TestRulePrecedence(t *testing.T) { + // With append + forward iteration: Policy is checked before Rule + s := New().Policy(PolicyTxt).Rule(FilterControl, TransformStrip) + + // \x07 is both control AND non-printable - matches PolicyTxt first + // \x00 is both control AND non-printable - matches PolicyTxt first + input := "a\x07b\x00c" + expected := "a<07>b<00>c" // FIXED: Policy wins now + result := s.Sanitize(input) + + assert.Equal(t, expected, result, + "Policy() is now checked before Rule() - non-printable chars get hex encoded") +} + +func TestCompositeFilter(t *testing.T) { + s := New().Rule(FilterShellSpecial|FilterWhitespace, TransformStrip) + assert.Equal(t, "cmdechohello", s.Sanitize("cmd; echo hello")) + assert.Equal(t, "nopipesnospaces", s.Sanitize("no |pipes| no spaces")) +} + +func TestChaining(t *testing.T) { + s := New(). + Rule(FilterWhitespace, TransformStrip). 
+ Rule(FilterShellSpecial, TransformHexEncode) + + // Whitespace rule was added first, so it is checked first and strips spaces + // Shell special rule is checked second and hex encodes ';' + assert.Equal(t, "cmd<3b>echohello", s.Sanitize("cmd; echo hello")) +} + +func TestMultipleRulesOrder(t *testing.T) { + // Test that first matching rule wins + s := New(). + Rule(FilterControl, TransformStrip). + Rule(FilterControl, TransformHexEncode) // This should never match + + assert.Equal(t, "ab", s.Sanitize("a\x00b"), "first rule should win") +} + +func TestEdgeCases(t *testing.T) { + t.Run("empty string", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformStrip) + assert.Equal(t, "", s.Sanitize("")) + }) + + t.Run("only sanitizable characters", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformStrip) + assert.Equal(t, "", s.Sanitize("\x00\x01\x02\x03")) + }) + + t.Run("multi-byte UTF-8", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformHexEncode) + input := "Hello 世界 ✓" + assert.Equal(t, input, s.Sanitize(input), "UTF-8 should pass through") + }) + + t.Run("multi-byte control character", func(t *testing.T) { + s := New().Rule(FilterNonPrintable, TransformHexEncode) + // NEL (Next Line) is U+0085, encoded as C2 85 in UTF-8 + assert.Equal(t, "line1<c285>line2", s.Sanitize("line1\u0085line2")) + }) +} + +func TestSerializer(t *testing.T) { + t.Run("raw format with sanitizer", func(t *testing.T) { + san := New().Rule(FilterNonPrintable, TransformHexEncode) + handler := NewSerializer("raw", san) var buf []byte + handler.WriteString(&buf, "test\x00data") + assert.Equal(t, "test<00>data", string(buf)) + }) - // String with spaces gets quoted + t.Run("txt format with quotes", func(t *testing.T) { + san := New() // No sanitization + handler := NewSerializer("txt", san) + + var buf []byte handler.WriteString(&buf, "hello world") assert.Equal(t, `"hello world"`, string(buf)) - // String without spaces unquoted buf = nil - handler.WriteString(&buf,
"single") - assert.Equal(t, "single", string(buf)) - - // Nil handling - buf = nil - handler.WriteNil(&buf) - assert.Equal(t, "null", string(buf)) - - // Quotes needed for empty or space-containing - assert.True(t, handler.NeedsQuotes("")) - assert.True(t, handler.NeedsQuotes("has space")) - assert.False(t, handler.NeedsQuotes("nospace")) + handler.WriteString(&buf, "nospace") + assert.Equal(t, "nospace", string(buf)) }) - t.Run("json format", func(t *testing.T) { - san := New(Escape) // Not used for JSON, direct escaping - handler := NewUnifiedHandler("json", san) + t.Run("json format escaping", func(t *testing.T) { + san := New() // JSON handler does its own escaping + handler := NewSerializer("json", san) var buf []byte - - // JSON escaping handler.WriteString(&buf, "line1\nline2\t\"quoted\"") assert.Equal(t, `"line1\nline2\t\"quoted\""`, string(buf)) - // Control char escaping buf = nil handler.WriteString(&buf, "null\x00byte") assert.Equal(t, `"null\u0000byte"`, string(buf)) - - // Always quotes - assert.True(t, handler.NeedsQuotes("anything")) }) t.Run("complex value handling", func(t *testing.T) { - san := New(HexEncode) + san := New() + handler := NewSerializer("raw", san) - // Raw uses spew - rawHandler := NewUnifiedHandler("raw", san) var buf []byte - rawHandler.WriteComplex(&buf, map[string]int{"a": 1}) + handler.WriteComplex(&buf, map[string]int{"a": 1}) assert.Contains(t, string(buf), "map[") - - // Txt/JSON use fmt.Sprintf - txtHandler := NewUnifiedHandler("txt", san) - buf = nil - txtHandler.WriteComplex(&buf, []int{1, 2, 3}) - assert.Contains(t, string(buf), "[1 2 3]") }) + + t.Run("nil handling", func(t *testing.T) { + san := New() + + rawHandler := NewSerializer("raw", san) + var buf []byte + rawHandler.WriteNil(&buf) + assert.Equal(t, "nil", string(buf)) + + jsonHandler := NewSerializer("json", san) + buf = nil + jsonHandler.WriteNil(&buf) + assert.Equal(t, "null", string(buf)) + }) +} + +func TestPolicyWithCustomRules(t *testing.T) { + s := 
New(). + Policy(PolicyTxt). + Rule(FilterControl, TransformStrip). + Rule(FilterWhitespace, TransformJSONEscape) + + // \x07 is non-printable AND control - matches PolicyTxt first (hex encode) + // \x7F is non-printable but NOT control - matches PolicyTxt (hex encode) + input := "a\x07b c\x7Fd" + result := s.Sanitize(input) + + assert.Equal(t, "a<07>b c<7f>d", result) // FIXED: \x07 now hex encoded } func BenchmarkSanitizer(b *testing.B) { input := strings.Repeat("normal text\x00\n\t", 100) benchmarks := []struct { - name string - mode Mode + name string + sanitizer *Sanitizer }{ - {"None", None}, - {"HexEncode", HexEncode}, - {"Strip", Strip}, - {"Escape", Escape}, + {"Passthrough", New()}, + {"SingleRule", New().Rule(FilterNonPrintable, TransformHexEncode)}, + {"Policy", New().Policy(PolicyTxt)}, + {"Complex", New(). + Policy(PolicyTxt). + Rule(FilterControl, TransformStrip). + Rule(FilterWhitespace, TransformJSONEscape)}, } for _, bm := range benchmarks { b.Run(bm.name, func(b *testing.B) { - s := New(bm.mode) b.ResetTimer() for i := 0; i < b.N; i++ { - _ = s.Sanitize(input) + _ = bm.sanitizer.Sanitize(input) } }) } +} + +func TestTransformPriority(t *testing.T) { + // Test that only one transform is applied per rule + s := New().Rule(FilterControl, TransformStrip|TransformHexEncode) + + // Should strip (first flag checked), not hex encode + assert.Equal(t, "ab", s.Sanitize("a\x00b")) } \ No newline at end of file diff --git a/storage_test.go b/storage_test.go index 9e7dcef..3f80269 100644 --- a/storage_test.go +++ b/storage_test.go @@ -19,7 +19,7 @@ func TestLogRotation(t *testing.T) { defer logger.Shutdown() cfg := logger.GetConfig() - cfg.MaxSizeKB = 1000 // 1MB + cfg.MaxSizeKB = 100 // 100KB cfg.FlushIntervalMs = 10 // Fast flush for testing logger.ApplyConfig(cfg) @@ -27,11 +27,11 @@ func TestLogRotation(t *testing.T) { // Account for timestamp, level, and other formatting overhead // A typical log line overhead is ~50-100 bytes const overhead = 100 - const 
targetMessageSize = 50000 // 50KB per message + const targetMessageSize = 5000 // 5KB per message largeData := strings.Repeat("x", targetMessageSize) // Write enough to exceed 1MB twice (should cause at least one rotation) - messagesNeeded := (2 * sizeMultiplier * 1000) / (targetMessageSize + overhead) // ~40 messages + messagesNeeded := int((2 * sizeMultiplier * cfg.MaxSizeKB) / (targetMessageSize + overhead)) // ~40 messages for i := 0; i < messagesNeeded; i++ { logger.Info(fmt.Sprintf("msg%d:", i), largeData)