Migrate to go-enry new version (#10906)

2020-04-15 20:40:39 +03:00 · 2020-04-15 20:40:39 +03:00 · 4dc62dadce
commit 4dc62dadce
parent 7a67bcc204
65 changed files with 111849 additions and 102276 deletions
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/common.go
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/common.go
@ -0,0 +1,7 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
+package tokenizer
+
+// ByteLimit defines the maximum prefix of an input text that will be tokenized.
+const ByteLimit = 100000
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/lex.linguist_yy.c
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/lex.linguist_yy.c
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/lex.linguist_yy.h
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/lex.linguist_yy.h
@ -0,0 +1,336 @@
+#ifndef linguist_yyHEADER_H
+#define linguist_yyHEADER_H 1
+#define linguist_yyIN_HEADER 1
+
+#line 6 "lex.linguist_yy.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+typedef uint64_t flex_uint64_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+#endif /* ! C99 */
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#define YY_BUF_SIZE 16384
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	yy_size_t yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
+void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void linguist_yypop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
+
+void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
+void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void linguist_yyfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+#define sgml 1
+#define c_comment 2
+#define xml_comment 3
+#define haskell_comment 4
+#define ocaml_comment 5
+#define python_dcomment 6
+#define python_scomment 7
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#define YY_EXTRA_TYPE struct tokenizer_extra *
+
+int linguist_yylex_init (yyscan_t* scanner);
+
+int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int linguist_yylex_destroy (yyscan_t yyscanner );
+
+int linguist_yyget_debug (yyscan_t yyscanner );
+
+void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
+
+void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_in (yyscan_t yyscanner );
+
+void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_out (yyscan_t yyscanner );
+
+void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
+
+char *linguist_yyget_text (yyscan_t yyscanner );
+
+int linguist_yyget_lineno (yyscan_t yyscanner );
+
+void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int linguist_yywrap (yyscan_t yyscanner );
+#else
+extern int linguist_yywrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int linguist_yylex (yyscan_t yyscanner);
+
+#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 118 "tokenizer.l"
+
+
+#line 335 "lex.linguist_yy.h"
+#undef linguist_yyIN_HEADER
+#endif /* linguist_yyHEADER_H */
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/linguist.h
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/linguist.h
@ -0,0 +1,15 @@
+// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
+enum tokenizer_type {
+  NO_ACTION,
+  REGULAR_TOKEN,
+  SHEBANG_TOKEN,
+  SGML_TOKEN,
+};
+
+struct tokenizer_extra {
+  char *token;
+  enum tokenizer_type type;
+};
+
+// TODO(bzz) port Win support from
+// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/tokenize_c.go
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/flex/tokenize_c.go
@ -0,0 +1,73 @@
+// +build flex
+
+package flex
+
+// #include <stdlib.h>
+// #include "linguist.h"
+// #include "lex.linguist_yy.h"
+// int linguist_yywrap(yyscan_t yyscanner) {
+// 	return 1;
+// }
+import "C"
+import "unsafe"
+
+const maxTokenLen = 32 // bytes
+
+// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
+func TokenizeFlex(content []byte) []string {
+	var buf C.YY_BUFFER_STATE
+	var scanner C.yyscan_t
+	var extra C.struct_tokenizer_extra
+	var _len C.ulong
+	var r C.int
+
+	_len = C.ulong(len(content))
+	cs := C.CBytes(content)
+	defer C.free(unsafe.Pointer(cs))
+
+	C.linguist_yylex_init_extra(&extra, &scanner)
+	buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
+
+	ary := []string{}
+	for {
+		extra._type = C.NO_ACTION
+		extra.token = nil
+		r = C.linguist_yylex(scanner)
+		switch extra._type {
+		case C.NO_ACTION:
+			break
+		case C.REGULAR_TOKEN:
+			_len = C.strlen(extra.token)
+			if _len <= maxTokenLen {
+				ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SHEBANG_TOKEN:
+			_len = C.strlen(extra.token)
+			if _len <= maxTokenLen {
+				s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
+				ary = append(ary, s)
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		case C.SGML_TOKEN:
+			_len = C.strlen(extra.token)
+			if _len <= maxTokenLen {
+				s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
+				ary = append(ary, s)
+			}
+			C.free(unsafe.Pointer(extra.token))
+			break
+		}
+		if r == 0 {
+			break
+		}
+	}
+
+	C.linguist_yy_delete_buffer(buf, scanner)
+	C.linguist_yylex_destroy(scanner)
+
+	return ary
+}
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/tokenize.go
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/tokenize.go
@ -0,0 +1,210 @@
+// +build !flex
+
+package tokenizer
+
+import (
+	"bytes"
+
+	"github.com/go-enry/go-enry/v2/regex"
+)
+
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+//
+// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
+// differences between this function and the Linguist output.
+func Tokenize(content []byte) []string {
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
+	}
+
+	tokens := make([][]byte, 0, 50)
+	for _, extract := range extractTokens {
+		var extractedTokens [][]byte
+		content, extractedTokens = extract(content)
+		tokens = append(tokens, extractedTokens...)
+	}
+
+	return toString(tokens)
+}
+
+func toString(tokens [][]byte) []string {
+	stokens := make([]string, 0, len(tokens))
+	for _, token := range tokens {
+		stokens = append(stokens, string(token))
+	}
+
+	return stokens
+}
+
+var (
+	extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
+		// The order to must be this
+		extractAndReplaceShebang,
+		extractAndReplaceSGML,
+		skipCommentsAndLiterals,
+		extractAndReplacePunctuation,
+		extractAndReplaceRegular,
+		extractAndReplaceOperator,
+		extractRemainders,
+	}
+
+	// Differences between golang regexp and oniguruma:
+	// 1. no (?s) in oniguruma - makes dot match \n
+	// 2. no (?U) in oniguruma - ungreedy *
+	// 3. (?m) implies dot matches \n in oniguruma
+	// 4. oniguruma handles \w differently - impossible, but true
+	//
+	// Workarounds:
+	// 1. (.|\n)
+	// 2. replace * with *?
+	// 3. replace . with [^\n]
+	// 4. replace \w with [0-9A-Za-z_]
+	//
+	// Original golang regexps:
+	//
+	// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
+	// reSingleLineComment   = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
+	// reMultilineComment    = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
+	// reLiteralNumber       = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
+	// reShebang             = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
+	// rePunctuation         = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
+	// reSGML                = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
+	// reSGMLComment         = regexp.MustCompile(`(?sU)(<!--.*-->)`)
+	// reSGMLAttributes      = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
+	// reSGMLLoneAttribute   = regexp.MustCompile(`(\w+)`)
+	// reRegularToken        = regexp.MustCompile(`[\w\.@#\/\*]+`)
+	// reOperators           = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
+	//
+	// These regexps were converted to work in the same way for both engines:
+	//
+	reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
+	reSingleLineComment   = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
+	reMultilineComment    = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
+	reLiteralNumber       = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
+	reShebang             = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
+	rePunctuation         = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
+	reSGML                = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
+	reSGMLComment         = regex.MustCompile(`(<!--(.|\n)*?-->)`)
+	reSGMLAttributes      = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
+	reSGMLLoneAttribute   = regex.MustCompile(`([0-9A-Za-z_]+)`)
+	reRegularToken        = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
+	reOperators           = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
+
+	regexToSkip = []regex.EnryRegexp{
+		// The order must be this
+		reLiteralStringQuotes,
+		reMultilineComment,
+		reSingleLineComment,
+		reLiteralNumber,
+	}
+)
+
+func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
+	var shebangTokens [][]byte
+	matches := reShebang.FindAllSubmatch(content, -1)
+	if matches != nil {
+		shebangTokens = make([][]byte, 0, 2)
+		for _, match := range matches {
+			shebangToken := getShebangToken(match)
+			shebangTokens = append(shebangTokens, shebangToken)
+		}
+
+		reShebang.ReplaceAll(content, []byte(` `))
+	}
+
+	return content, shebangTokens
+}
+
+func getShebangToken(matchedShebang [][]byte) []byte {
+	const prefix = `SHEBANG#!`
+	var token []byte
+	for i := 1; i < len(matchedShebang); i++ {
+		if len(matchedShebang[i]) > 0 {
+			token = matchedShebang[i]
+			break
+		}
+	}
+
+	tokenShebang := append([]byte(prefix), token...)
+	return tokenShebang
+}
+
+func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
+	tokens := re.FindAll(content, -1)
+	content = re.ReplaceAll(content, []byte(` `))
+	return content, tokens
+}
+
+func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
+	return commonExtractAndReplace(content, rePunctuation)
+}
+
+func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
+	return commonExtractAndReplace(content, reRegularToken)
+}
+
+func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
+	return commonExtractAndReplace(content, reOperators)
+}
+
+func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
+	var SGMLTokens [][]byte
+	matches := reSGML.FindAllSubmatch(content, -1)
+	if matches != nil {
+		SGMLTokens = make([][]byte, 0, 2)
+		for _, match := range matches {
+			if reSGMLComment.Match(match[0]) {
+				continue
+			}
+
+			token := append(append([]byte(nil), match[1]...), '>')
+			SGMLTokens = append(SGMLTokens, token)
+			attributes := getSGMLAttributes(match[0])
+			SGMLTokens = append(SGMLTokens, attributes...)
+		}
+
+		content = reSGML.ReplaceAll(content, []byte(` `))
+	}
+
+	return content, SGMLTokens
+}
+
+func getSGMLAttributes(SGMLTag []byte) [][]byte {
+	var attributes [][]byte
+	matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
+	if matches != nil {
+		attributes = make([][]byte, 0, 5)
+		for _, match := range matches {
+			if len(match[1]) != 0 {
+				attributes = append(attributes, match[1])
+			}
+
+			if len(match[2]) != 0 {
+				loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
+				attributes = append(attributes, loneAttributes...)
+			}
+		}
+	}
+
+	return attributes
+}
+
+func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
+	for _, skip := range regexToSkip {
+		content = skip.ReplaceAll(content, []byte(` `))
+	}
+
+	return content, nil
+}
+
+func extractRemainders(content []byte) ([]byte, [][]byte) {
+	splitted := bytes.Fields(content)
+	remainderTokens := make([][]byte, 0, len(splitted)*3)
+	for _, remainder := range splitted {
+		remainders := bytes.Split(remainder, nil)
+		remainderTokens = append(remainderTokens, remainders...)
+	}
+
+	return content, remainderTokens
+}
--- a/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/tokenize_c.go
+++ b/vendor/github.com/go-enry/go-enry/v2/internal/tokenizer/tokenize_c.go
@ -0,0 +1,15 @@
+// +build flex
+
+package tokenizer
+
+import "github.com/go-enry/go-enry/v2/internal/tokenizer/flex"
+
+// Tokenize returns lexical tokens from content. The tokens returned match what
+// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
+func Tokenize(content []byte) []string {
+	if len(content) > ByteLimit {
+		content = content[:ByteLimit]
+	}
+
+	return flex.TokenizeFlex(content)
+}