// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.
package obfuscate
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"strings"
"unicode"
"unicode/utf8"
sqllexer "github.com/DataDog/go-sqllexer"
"github.com/outcaste-io/ristretto/z"
)
var questionMark = []byte("?")
// metadataFinderFilter is a filter which attempts to collect metadata from a query, such as comments and tables.
// It is meant to run before all the other filters.
type metadataFinderFilter struct {
collectTableNames bool
collectCommands bool
collectComments bool
replaceDigits bool
// size holds the byte size of the metadata collected by the filter.
size int64
// tablesSeen keeps track of unique table names encountered by the filter.
tablesSeen map[string]struct{}
// tablesCSV specifies a comma-separated list of tables.
tablesCSV strings.Builder
// commands keeps track of commands encountered by the filter.
commands []string
// comments keeps track of comments encountered by the filter.
comments []string
}
func (f *metadataFinderFilter) Filter(token, lastToken TokenKind, buffer []byte) (TokenKind, []byte, error) {
if f.collectComments && token == Comment {
// A comment with line-breaks will be brought to a single line.
comment := strings.TrimSpace(strings.ReplaceAll(string(buffer), "\n", " "))
f.size += int64(len(comment))
f.comments = append(f.comments, comment)
}
if f.collectCommands {
switch token {
case Select, Update, Insert, Delete, Join, Alter, Drop, Create, Grant, Revoke, Commit, Begin, Truncate:
command := strings.ToUpper(token.String())
f.size += int64(len(command))
f.commands = append(f.commands, command)
}
}
if f.collectTableNames {
switch lastToken {
case From, Join:
// SELECT ... FROM [tableName]
// DELETE FROM [tableName]
// ... JOIN [tableName]
if r, _ := utf8.DecodeRune(buffer); !unicode.IsLetter(r) {
// first character in buffer is not a letter; we might have a nested
// query like SELECT * FROM (SELECT ...)
break
}
fallthrough
case Update, Into:
// UPDATE [tableName]
// INSERT INTO [tableName]
tableName := string(buffer)
if f.replaceDigits {
tableNameCopy := make([]byte, len(buffer))
copy(tableNameCopy, buffer)
tableName = string(replaceDigits(tableNameCopy))
}
f.storeTableName(tableName)
return TableName, buffer, nil
}
}
return token, buffer, nil
}
func (f *metadataFinderFilter) storeTableName(name string) {
if _, ok := f.tablesSeen[name]; ok {
return
}
if f.tablesSeen == nil {
f.tablesSeen = make(map[string]struct{}, 1)
}
f.tablesSeen[name] = struct{}{}
if f.tablesCSV.Len() > 0 {
f.size++
f.tablesCSV.WriteByte(',')
}
f.size += int64(len(name))
f.tablesCSV.WriteString(name)
}
// Results returns metadata collected by the filter for an SQL statement.
func (f *metadataFinderFilter) Results() SQLMetadata {
return SQLMetadata{
Size: f.size,
TablesCSV: f.tablesCSV.String(),
Commands: f.commands,
Comments: f.comments,
}
}
// Reset implements tokenFilter.
func (f *metadataFinderFilter) Reset() {
for k := range f.tablesSeen {
delete(f.tablesSeen, k)
}
f.size = 0
f.tablesCSV.Reset()
f.commands = f.commands[:0]
f.comments = f.comments[:0]
}
// discardFilter is a token filter which discards certain elements from a query, such as
// comments and AS aliases by returning a nil buffer.
type discardFilter struct {
keepSQLAlias bool
}
// Filter the given token so that a `nil` slice is returned if the token is in the token filtered list.
func (f *discardFilter) Filter(token, lastToken TokenKind, buffer []byte) (TokenKind, []byte, error) {
// filters based on previous token
switch lastToken {
case FilteredBracketedIdentifier:
if token != ']' {
// we haven't found the closing bracket yet, keep going
if token != ID {
// the token between the brackets *must* be an identifier,
// otherwise the query is invalid.
return LexError, nil, fmt.Errorf("expected identifier in bracketed filter, got %d", token)
}
return FilteredBracketedIdentifier, nil, nil
}
fallthrough
case As:
if token == '[' {
// the identifier followed by AS is an MSSQL bracketed identifier
// and will continue to be discarded until we find the corresponding
// closing bracket counter-part. See GitHub issue DataDog/datadog-trace-agent#475.
return FilteredBracketedIdentifier, nil, nil
}
if f.keepSQLAlias {
return token, buffer, nil
}
return Filtered, nil, nil
}
// filters based on the current token; if the next token should be ignored,
// return the same token value (not FilteredGroupable) and nil
switch token {
case Comment:
return Filtered, nil, nil
case ';':
return markFilteredGroupable(token), nil, nil
case As:
if !f.keepSQLAlias {
return As, nil, nil
}
fallthrough
default:
return token, buffer, nil
}
}
// Reset implements tokenFilter.
func (f *discardFilter) Reset() {}
// replaceFilter is a token filter which obfuscates strings and numbers in queries by replacing them
// with the "?" character.
type replaceFilter struct {
replaceDigits bool
}
// Filter the given token so that it will be replaced if in the token replacement list
func (f *replaceFilter) Filter(token, lastToken TokenKind, buffer []byte) (tokenType TokenKind, tokenBytes []byte, err error) {
switch lastToken {
case Savepoint:
return markFilteredGroupable(token), questionMark, nil
case '=':
switch token {
case DoubleQuotedString:
// double-quoted strings after assignments are eligible for obfuscation
return markFilteredGroupable(token), questionMark, nil
}
}
switch token {
case DollarQuotedString, String, Number, Null, Variable, PreparedStatement, BooleanLiteral, EscapeSequence:
return markFilteredGroupable(token), questionMark, nil
case '?':
// Cases like 'ARRAY [ ?, ? ]' should be collapsed into 'ARRAY [ ? ]'
return markFilteredGroupable(token), questionMark, nil
case TableName, ID:
if f.replaceDigits {
return token, replaceDigits(buffer), nil
}
fallthrough
default:
return token, buffer, nil
}
}
// Reset implements tokenFilter.
func (f *replaceFilter) Reset() {}
// groupingFilter is a token filter which groups together items replaced by the replaceFilter. It is meant
// to run immediately after it.
type groupingFilter struct {
groupFilter int // counts the number of values, e.g. 3 = ?, ?, ?
groupMulti int // counts the number of groups, e.g. 2 = (?, ?), (?, ?, ?)
}
// Filter the given token so that it will be discarded if a grouping pattern
// has been recognized. A grouping is composed by items like:
// - '( ?, ?, ? )'
// - '( ?, ? ), ( ?, ? )'
func (f *groupingFilter) Filter(token, lastToken TokenKind, buffer []byte) (tokenType TokenKind, tokenBytes []byte, err error) {
// increasing the number of groups means that we're filtering an entire group
// because it can be represented with a single '( ? )'
if (lastToken == '(' && isFilteredGroupable(token)) || (token == '(' && f.groupMulti > 0) {
f.groupMulti++
}
// Potential commands that could indicate the start of a subquery.
isStartOfSubquery := token == Select || token == Delete || token == Update || token == ID
switch {
case f.groupMulti > 0 && lastToken == FilteredGroupableParenthesis && isStartOfSubquery:
// this is the start of a new group that seems to be a nested query;
// cancel grouping.
f.Reset()
return token, append([]byte("( "), buffer...), nil
case isFilteredGroupable(token):
// the previous filter has dropped this token so we should start
// counting the group filter so that we accept only one '?' for
// the same group
f.groupFilter++
if f.groupFilter > 1 {
return markFilteredGroupable(token), nil, nil
}
case f.groupFilter > 0 && (token == ',' || token == '?'):
// if we are in a group drop all commas
return markFilteredGroupable(token), nil, nil
case f.groupMulti > 1:
// drop all tokens since we're in a counting group
// and they're duplicated
return markFilteredGroupable(token), nil, nil
case token != ',' && token != '(' && token != ')' && !isFilteredGroupable(token):
// when we're out of a group reset the filter state
f.Reset()
}
return token, buffer, nil
}
// isFilteredGroupable reports whether token is to be considered filtered groupable.
func isFilteredGroupable(token TokenKind) bool {
switch token {
case FilteredGroupable, FilteredGroupableParenthesis:
return true
default:
return false
}
}
// markFilteredGroupable returns the appropriate TokenKind to mark this token as
// filtered groupable.
func markFilteredGroupable(token TokenKind) TokenKind {
switch token {
case '(':
return FilteredGroupableParenthesis
default:
return FilteredGroupable
}
}
// Reset resets the groupingFilter so that it may be used again.
func (f *groupingFilter) Reset() {
f.groupFilter = 0
f.groupMulti = 0
}
func isSQLLexer(obfuscationMode ObfuscationMode) bool {
return obfuscationMode != ""
}
// ObfuscateSQLString quantizes and obfuscates the given input SQL query string. Quantization removes
// some elements such as comments and aliases and obfuscation attempts to hide sensitive information
// in strings and numbers by redacting them.
func (o *Obfuscator) ObfuscateSQLString(in string) (*ObfuscatedQuery, error) {
return o.ObfuscateSQLStringWithOptions(in, &o.opts.SQL, o.sqlOptsStr)
}
// ObfuscateSQLStringForDBMS quantizes and obfuscates the given input SQL query string for a specific DBMS.
func (o *Obfuscator) ObfuscateSQLStringForDBMS(in string, dbms string) (*ObfuscatedQuery, error) {
if isSQLLexer(o.opts.SQL.ObfuscationMode) {
o.opts.SQL.DBMS = dbms
}
return o.ObfuscateSQLStringWithOptions(in, &o.opts.SQL, o.sqlOptsStr)
}
// ObfuscateSQLStringWithOptions accepts an optional SQLOptions to change the behavior of the obfuscator
// to quantize and obfuscate the given input SQL query string. Quantization removes some elements such as comments
// and aliases and obfuscation attempts to hide sensitive information in strings and numbers by redacting them.
func (o *Obfuscator) ObfuscateSQLStringWithOptions(in string, opts *SQLConfig, optsStr string) (oq *ObfuscatedQuery, err error) {
var optsStrError error
if optsStr == "" {
var optsBytes []byte
// Ideally we should never fallback to this because it's expensive
o.log.Warnf("falling back to JSON marshalling of SQLConfig options, this should be avoided if possible")
optsBytes, optsStrError = json.Marshal(opts)
optsStr = string(optsBytes)
}
if optsStrError == nil && o.queryCache.Cache != nil {
cacheKey := z.MemHashString(in) ^ z.MemHashString(optsStr)
if v, ok := o.queryCache.Get(cacheKey); ok {
return v.(*ObfuscatedQuery), nil
}
defer func() {
if oq != nil && err == nil {
o.queryCache.Set(cacheKey, oq, oq.Cost())
}
}()
}
if opts.ObfuscationMode != "" {
// If obfuscation mode is specified, we will use go-sqllexer pkg
// to obfuscate (and normalize) the query.
oq, err = o.ObfuscateWithSQLLexer(in, opts)
} else {
oq, err = o.obfuscateSQLString(in, opts)
}
if err != nil {
return oq, err
}
return oq, nil
}
func (o *Obfuscator) obfuscateSQLString(in string, opts *SQLConfig) (*ObfuscatedQuery, error) {
lesc := o.useSQLLiteralEscapes()
tok := NewSQLTokenizer(in, lesc, opts)
out, err := attemptObfuscation(tok)
if err != nil && tok.SeenEscape() {
// If the tokenizer failed, but saw an escape character in the process,
// try again treating escapes differently
tok = NewSQLTokenizer(in, !lesc, opts)
if out, err2 := attemptObfuscation(tok); err2 == nil {
// If the second attempt succeeded, change the default behavior so that
// on the next run we get it right in the first run.
o.setSQLLiteralEscapes(!lesc)
return out, nil
}
}
return out, err
}
// ObfuscatedQuery specifies information about an obfuscated SQL query.
type ObfuscatedQuery struct {
Query string `json:"query"` // the obfuscated SQL query
Metadata SQLMetadata `json:"metadata"` // metadata extracted from the SQL query
}
// Cost returns the number of bytes needed to store all the fields
// of this ObfuscatedQuery.
func (oq *ObfuscatedQuery) Cost() int64 {
// The cost of the ObfuscatedQuery struct is the sum of the length of the query string,
// the size of the metadata content, and the size of the struct itself and its fields headers.
// 320 bytes come from
// - 112 bytes for the ObfuscatedQuery struct itself, measured by unsafe.Sizeof(ObfuscatedQuery{})
// - 96 bytes for the Metadata struct itself, measured by unsafe.Sizeof(SQLMetadata{})
// - 16 bytes for the Query string header
// - 16 bytes for the TablesCSV string header
// - 24 * 3 bytes for the Comments, Commands, and Procedures slices headers
// - 8 bytes for the Size int64 field
return int64(len(oq.Query)) + oq.Metadata.Size + 320
}
// attemptObfuscation attempts to obfuscate the SQL query loaded into the tokenizer, using the given set of filters.
func attemptObfuscation(tokenizer *SQLTokenizer) (*ObfuscatedQuery, error) {
var (
out = bytes.NewBuffer(make([]byte, 0, len(tokenizer.buf)))
err error
lastToken TokenKind
metadata = metadataFinderFilter{
collectTableNames: tokenizer.cfg.TableNames,
collectCommands: tokenizer.cfg.CollectCommands,
collectComments: tokenizer.cfg.CollectComments,
replaceDigits: tokenizer.cfg.ReplaceDigits,
}
discard = discardFilter{keepSQLAlias: tokenizer.cfg.KeepSQLAlias}
replace = replaceFilter{replaceDigits: tokenizer.cfg.ReplaceDigits}
grouping groupingFilter
)
defer metadata.Reset()
// call Scan() function until tokens are available or if a LEX_ERROR is raised. After
// retrieving a token, send it to the tokenFilter chains so that the token is discarded
// or replaced.
for {
token, buff := tokenizer.Scan()
if token == EndChar {
break
}
if token == LexError {
return nil, fmt.Errorf("%v", tokenizer.Err())
}
if token, buff, err = metadata.Filter(token, lastToken, buff); err != nil {
return nil, err
}
if token, buff, err = discard.Filter(token, lastToken, buff); err != nil {
return nil, err
}
if token, buff, err = replace.Filter(token, lastToken, buff); err != nil {
return nil, err
}
if token, buff, err = grouping.Filter(token, lastToken, buff); err != nil {
return nil, err
}
if buff != nil {
if out.Len() != 0 {
switch token {
case ',':
case '=':
if lastToken == ':' {
// do not add a space before an equals if a colon was
// present before it.
break
}
fallthrough
default:
out.WriteRune(' ')
}
}
out.Write(buff)
}
lastToken = token
}
if out.Len() == 0 {
return nil, errors.New("result is empty")
}
return &ObfuscatedQuery{
Query: out.String(),
Metadata: metadata.Results(),
}, nil
}
// ObfuscateSQLExecPlan obfuscates query conditions in the provided JSON encoded execution plan. If normalize=True,
// then cost and row estimates are also obfuscated away.
func (o *Obfuscator) ObfuscateSQLExecPlan(jsonPlan string, normalize bool) (string, error) {
if normalize {
return o.sqlExecPlanNormalize.obfuscate([]byte(jsonPlan))
}
return o.sqlExecPlan.obfuscate([]byte(jsonPlan))
}
// ObfuscateWithSQLLexer obfuscates the given SQL query using the go-sqllexer package.
// If ObfuscationMode is set to ObfuscateOnly, the query will be obfuscated without normalizing it.
func (o *Obfuscator) ObfuscateWithSQLLexer(in string, opts *SQLConfig) (*ObfuscatedQuery, error) {
if opts.ObfuscationMode != NormalizeOnly && opts.ObfuscationMode != ObfuscateOnly && opts.ObfuscationMode != ObfuscateAndNormalize {
return nil, fmt.Errorf("invalid obfuscation mode: %s", opts.ObfuscationMode)
}
var obfuscator *sqllexer.Obfuscator
if opts.ObfuscationMode == ObfuscateOnly || opts.ObfuscationMode == ObfuscateAndNormalize {
obfuscator = sqllexer.NewObfuscator(
sqllexer.WithReplaceDigits(opts.ReplaceDigits),
sqllexer.WithDollarQuotedFunc(opts.DollarQuotedFunc),
sqllexer.WithReplacePositionalParameter(!opts.KeepPositionalParameter),
sqllexer.WithReplaceBoolean(!opts.KeepBoolean),
sqllexer.WithReplaceNull(!opts.KeepNull),
sqllexer.WithKeepJsonPath(opts.KeepJSONPath),
)
}
if opts.ObfuscationMode == ObfuscateOnly {
// Obfuscate the query without normalizing it.
out := obfuscator.Obfuscate(in, sqllexer.WithDBMS(sqllexer.DBMSType(opts.DBMS)))
return &ObfuscatedQuery{
Query: out,
}, nil
}
// Obfuscate the query and normalize it.
normalizer := sqllexer.NewNormalizer(
sqllexer.WithCollectComments(opts.CollectComments),
sqllexer.WithCollectCommands(opts.CollectCommands),
sqllexer.WithCollectTables(opts.TableNames),
sqllexer.WithCollectProcedures(opts.CollectProcedures),
sqllexer.WithKeepSQLAlias(opts.KeepSQLAlias),
sqllexer.WithRemoveSpaceBetweenParentheses(opts.RemoveSpaceBetweenParentheses),
sqllexer.WithKeepTrailingSemicolon(opts.KeepTrailingSemicolon),
sqllexer.WithKeepIdentifierQuotation(opts.KeepIdentifierQuotation),
)
var out string
var statementMetadata *sqllexer.StatementMetadata
var err error
if opts.ObfuscationMode == NormalizeOnly {
// Normalize the query without obfuscating it.
out, statementMetadata, err = normalizer.Normalize(in, sqllexer.WithDBMS(sqllexer.DBMSType(opts.DBMS)))
} else {
out, statementMetadata, err = sqllexer.ObfuscateAndNormalize(
in,
obfuscator,
normalizer,
sqllexer.WithDBMS(sqllexer.DBMSType(opts.DBMS)),
)
}
if err != nil {
return nil, err
}
oq := &ObfuscatedQuery{
Query: out,
Metadata: SQLMetadata{
Size: int64(statementMetadata.Size),
TablesCSV: strings.Join(statementMetadata.Tables, ","),
Commands: statementMetadata.Commands,
Comments: statementMetadata.Comments,
Procedures: statementMetadata.Procedures,
},
}
return oq, nil
}