Migrate to go-enry new version (#10906)

This commit is contained in:
Lauris BH 2020-04-15 20:40:39 +03:00 committed by GitHub
parent 7a67bcc204
commit 4dc62dadce
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
65 changed files with 111849 additions and 102276 deletions

View file

@ -0,0 +1,7 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
package tokenizer
// ByteLimit defines the maximum prefix of an input text that will be tokenized,
// measured in bytes. Tokenize truncates longer content to this length before
// extracting any tokens.
const ByteLimit = 100000

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,336 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1
#line 6 "lex.linguist_yy.h"
#define YY_INT_ALIGNED short int
/* A lexical scanner generated by flex */
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
/* First, we deal with platform-specific or compiler-specific issues. */
/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
/* end standard C headers. */
/* flex integer type definitions */
#ifndef FLEXINT_H
#define FLEXINT_H
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
* if you want the limit (max/min) macros for int types.
*/
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST
#else /* ! __cplusplus */
/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)
#define YY_USE_CONST
#endif /* defined (__STDC__) */
#endif /* ! __cplusplus */
#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif
/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif
/* For convenience, these vars (plus the bison vars far below)
are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#define YY_BUF_SIZE 16384
#endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif
#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
/* State kept for one scanner input buffer: the source FILE, the character
 * buffer with its current read position and sizes, and the per-buffer
 * bookkeeping flags described on each field below.
 * Generated by flex; do not edit by hand.
 */
struct yy_buffer_state
{
FILE *yy_input_file;
char *yy_ch_buf; /* input buffer */
char *yy_buf_pos; /* current position in input buffer */
/* Size of input buffer in bytes, not including room for EOB
* characters.
*/
yy_size_t yy_buf_size;
/* Number of characters read into yy_ch_buf, not including EOB
* characters.
*/
yy_size_t yy_n_chars;
/* Whether we "own" the buffer - i.e., we know we created it,
* and can realloc() it to grow it, and should free() it to
* delete it.
*/
int yy_is_our_buffer;
/* Whether this is an "interactive" input source; if so, and
* if we're using stdio for input, then we want to use getc()
* instead of fread(), to make sure we stop fetching input after
* each newline.
*/
int yy_is_interactive;
/* Whether we're considered to be at the beginning of a line.
* If so, '^' rules will be active on the next match, otherwise
* not.
*/
int yy_at_bol;
int yy_bs_lineno; /**< The line count. */
int yy_bs_column; /**< The column count. */
/* Whether to try to fill the input buffer when we reach the
* end of it.
*/
int yy_fill_buffer;
int yy_buffer_status;
};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );
/* Begin user sect3 */
#define yytext_ptr yytext_r
#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7
#endif
#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
* down here because we want the user's section 1 to have been scanned first.
* The user has a chance to override it with an option.
*/
#include <unistd.h>
#endif
#define YY_EXTRA_TYPE struct tokenizer_extra *
int linguist_yylex_init (yyscan_t* scanner);
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
/* Accessor methods to globals.
These are made visible to non-reentrant scanners for convenience. */
int linguist_yylex_destroy (yyscan_t yyscanner );
int linguist_yyget_debug (yyscan_t yyscanner );
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
FILE *linguist_yyget_in (yyscan_t yyscanner );
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
FILE *linguist_yyget_out (yyscan_t yyscanner );
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
char *linguist_yyget_text (yyscan_t yyscanner );
int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in
* section 1.
*/
#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif
#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif
#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif
#ifndef YY_NO_INPUT
#endif
/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#define YY_READ_BUF_SIZE 8192
#endif
/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif
/* Default declaration of generated scanner - a define so the user can
* easily add parameters.
*/
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1
extern int linguist_yylex (yyscan_t yyscanner);
#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */
/* yy_get_previous_state - get the state just before the EOB char was reached */
#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION
#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif
#line 118 "tokenizer.l"
#line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */

View file

@ -0,0 +1,15 @@
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
// Kind of result left in struct tokenizer_extra after one linguist_yylex call.
// The Go wrapper (TokenizeFlex) resets the kind to NO_ACTION before every call
// and then interprets the value as follows:
//   NO_ACTION     - nothing was produced; no token is read or freed.
//   REGULAR_TOKEN - token is appended as is.
//   SHEBANG_TOKEN - token is appended with a "SHEBANG#!" prefix.
//   SGML_TOKEN    - token is appended with a ">" suffix.
enum tokenizer_type {
NO_ACTION,
REGULAR_TOKEN,
SHEBANG_TOKEN,
SGML_TOKEN,
};
// User data attached to the scanner (lex.linguist_yy.h defines YY_EXTRA_TYPE as
// a pointer to this struct). It carries one token and its kind out of the lexer.
struct tokenizer_extra {
// NUL-terminated token text; the caller measures it with strlen() and releases
// it with free(). Presumably heap-allocated by the lexer actions in
// tokenizer.l - TODO confirm.
char *token;
// What, if anything, the last linguist_yylex call produced.
enum tokenizer_type type;
};
// TODO(bzz) port Win support from
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0

View file

@ -0,0 +1,73 @@
// +build flex
package flex
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
// maxTokenLen is the longest token, in bytes, that TokenizeFlex keeps. Longer
// tokens reported by the scanner are freed and left out of the result.
const maxTokenLen = 32 // bytes
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C.
// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
//
// content is copied into C memory and scanned until linguist_yylex returns 0.
// Tokens longer than maxTokenLen bytes are dropped. Shebang tokens are
// returned with a "SHEBANG#!" prefix and SGML tokens with a ">" suffix.
//
// The result is always a non-nil slice. It is empty when nothing was
// tokenized or when the scanner could not be initialised.
func TokenizeFlex(content []byte) []string {
	tokens := []string{}

	// The scanner keeps the address of its extra data for its whole lifetime,
	// i.e. across several cgo calls. The cgo rules forbid C code from holding
	// on to a Go pointer after a call returns, so the struct lives in C memory.
	extra := (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
	defer C.free(unsafe.Pointer(extra))

	// The input copy must stay alive until the function returns.
	cs := C.CBytes(content)
	defer C.free(cs)

	var scanner C.yyscan_t
	// A non-zero return means the scanner was not set up; using it would crash.
	if C.linguist_yylex_init_extra(extra, &scanner) != 0 {
		return tokens
	}
	// Deferred calls run last-in first-out: the buffer is deleted first, then
	// the scanner is destroyed, then the C allocations above are released.
	defer C.linguist_yylex_destroy(scanner)

	// size_t is the length type of the C API; unsigned long is not the same
	// type on every platform.
	buf := C.linguist_yy_scan_bytes((*C.char)(cs), C.size_t(len(content)), scanner)
	defer C.linguist_yy_delete_buffer(buf, scanner)

	for {
		// Reset the slot so that a call producing nothing is recognisable.
		extra._type = C.NO_ACTION
		extra.token = nil
		r := C.linguist_yylex(scanner)

		switch extra._type {
		case C.REGULAR_TOKEN:
			if tok, ok := takeToken(extra); ok {
				tokens = append(tokens, tok)
			}
		case C.SHEBANG_TOKEN:
			if tok, ok := takeToken(extra); ok {
				tokens = append(tokens, "SHEBANG#!"+tok)
			}
		case C.SGML_TOKEN:
			if tok, ok := takeToken(extra); ok {
				tokens = append(tokens, tok+">")
			}
		}

		if r == 0 {
			break
		}
	}
	return tokens
}

// takeToken copies the pending C token into a Go string and frees the C copy.
// It reports false, after still freeing the token, when the token is longer
// than maxTokenLen bytes.
func takeToken(extra *C.struct_tokenizer_extra) (string, bool) {
	defer C.free(unsafe.Pointer(extra.token))
	n := C.strlen(extra.token)
	if n > maxTokenLen {
		return "", false
	}
	return C.GoStringN(extra.token, C.int(n)), true
}

View file

@ -0,0 +1,210 @@
// +build !flex
package tokenizer
import (
"bytes"
"github.com/go-enry/go-enry/v2/regex"
)
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
//
// Every extractor in extractTokens runs in turn on the content left behind by
// the previous one, and the tokens are returned in the order they were found.
//
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
// differences between this function and the Linguist output.
func Tokenize(content []byte) []string {
	if len(content) > ByteLimit {
		content = content[:ByteLimit]
	}

	collected := make([][]byte, 0, 50)
	for i := range extractTokens {
		rest, found := extractTokens[i](content)
		content = rest
		collected = append(collected, found...)
	}

	return toString(collected)
}
// toString converts every byte-slice token into a string, keeping the order.
// The result is never nil, even for an empty or nil input.
func toString(tokens [][]byte) []string {
	out := make([]string, len(tokens))
	for i := range tokens {
		out[i] = string(tokens[i])
	}
	return out
}
// Package-level tokenization pipeline: the ordered list of extractors applied
// by Tokenize, the regular expressions they use, and the ordered list of
// expressions whose matches are blanked out without producing tokens.
var (
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
// The order must be this: each extractor receives the content returned by
// the previous one.
extractAndReplaceShebang,
extractAndReplaceSGML,
skipCommentsAndLiterals,
extractAndReplacePunctuation,
extractAndReplaceRegular,
extractAndReplaceOperator,
extractRemainders,
}
// Differences between golang regexp and oniguruma:
// 1. no (?s) in oniguruma - makes dot match \n
// 2. no (?U) in oniguruma - ungreedy *
// 3. (?m) implies dot matches \n in oniguruma
// 4. oniguruma handles \w differently - impossible, but true
//
// Workarounds:
// 1. (.|\n)
// 2. replace * with *?
// 3. replace . with [^\n]
// 4. replace \w with [0-9A-Za-z_]
//
// Original golang regexps:
//
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
//
// These regexps were converted to work in the same way for both engines:
//
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
// Expressions applied by skipCommentsAndLiterals: every match is replaced
// with a single space and yields no token.
regexToSkip = []regex.EnryRegexp{
// The order must be this
reLiteralStringQuotes,
reMultilineComment,
reSingleLineComment,
reLiteralNumber,
}
)
// extractAndReplaceShebang emits one "SHEBANG#!<interpreter>" token for every
// shebang line matched by reShebang and replaces each matched line with a
// single space in the returned content.
//
// When nothing matches, content is returned unchanged together with a nil
// token slice.
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
	matches := reShebang.FindAllSubmatch(content, -1)
	if matches == nil {
		return content, nil
	}

	shebangTokens := make([][]byte, 0, len(matches))
	for _, match := range matches {
		shebangTokens = append(shebangTokens, getShebangToken(match))
	}

	// ReplaceAll returns the replaced copy, so its result has to be kept.
	// Previously it was discarded and the shebang line stayed in content,
	// where the later extractors tokenized it a second time.
	// NOTE(review): this changes the emitted token stream; any golden fixtures
	// or frequency data generated with the old behaviour need regenerating.
	content = reShebang.ReplaceAll(content, []byte(` `))

	return content, shebangTokens
}
// getShebangToken builds the token for one reShebang submatch set: the literal
// "SHEBANG#!" followed by the first non-empty capture group. Element 0, the
// whole match, is ignored. With no non-empty group the bare prefix is returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	token := []byte(`SHEBANG#!`)
	for idx, group := range matchedShebang {
		if idx == 0 || len(group) == 0 {
			continue
		}
		return append(token, group...)
	}
	return token
}
// commonExtractAndReplace returns every match of re in content as a token,
// together with a copy of content in which each match is replaced by a single
// space.
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
	found := re.FindAll(content, -1)
	blanked := re.ReplaceAll(content, []byte(` `))
	return blanked, found
}
// extractAndReplacePunctuation extracts the matches of rePunctuation as tokens
// and replaces each of them with a space in the returned content.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
	rest, punctuation := commonExtractAndReplace(content, rePunctuation)
	return rest, punctuation
}
// extractAndReplaceRegular extracts the matches of reRegularToken as tokens
// and replaces each of them with a space in the returned content.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
	rest, regular := commonExtractAndReplace(content, reRegularToken)
	return rest, regular
}
// extractAndReplaceOperator extracts the matches of reOperators as tokens and
// replaces each of them with a space in the returned content.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
	rest, operators := commonExtractAndReplace(content, reOperators)
	return rest, operators
}
// extractAndReplaceSGML turns every reSGML match that is not an SGML comment
// into tokens: the tag opener with ">" appended, followed by the tag's
// attribute tokens. All reSGML matches, comments included, are replaced by a
// single space in the returned content.
//
// When nothing matches, content is returned unchanged with a nil token slice.
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
	tags := reSGML.FindAllSubmatch(content, -1)
	if tags == nil {
		return content, nil
	}

	sgmlTokens := make([][]byte, 0, 2)
	for _, tag := range tags {
		whole, opener := tag[0], tag[1]
		if reSGMLComment.Match(whole) {
			continue
		}

		// Copy the opener so appending '>' cannot write into content.
		closed := make([]byte, 0, len(opener)+1)
		closed = append(closed, opener...)
		closed = append(closed, '>')

		sgmlTokens = append(sgmlTokens, closed)
		sgmlTokens = append(sgmlTokens, getSGMLAttributes(whole)...)
	}

	return reSGML.ReplaceAll(content, []byte(` `)), sgmlTokens
}
// getSGMLAttributes returns the attribute tokens of a single SGML tag. For
// each reSGMLAttributes match, a non-empty first group is kept as is, and a
// non-empty second group is split into its reSGMLLoneAttribute matches.
// It returns nil when the tag has no attribute match.
func getSGMLAttributes(SGMLTag []byte) [][]byte {
	found := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
	if found == nil {
		return nil
	}

	attrs := make([][]byte, 0, 5)
	for _, groups := range found {
		if keyed := groups[1]; len(keyed) != 0 {
			attrs = append(attrs, keyed)
		}
		if bare := groups[2]; len(bare) != 0 {
			attrs = append(attrs, reSGMLLoneAttribute.FindAll(bare, -1)...)
		}
	}
	return attrs
}
// skipCommentsAndLiterals replaces every match of each regexToSkip entry, in
// list order, with a single space. It never produces tokens.
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
	blank := []byte(` `)
	for i := range regexToSkip {
		content = regexToSkip[i].ReplaceAll(content, blank)
	}
	return content, nil
}
// extractRemainders splits content on whitespace and returns every UTF-8
// sequence of every resulting field as its own token. content itself is
// returned unchanged.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)
	tokens := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		// A nil separator makes Split explode the field into UTF-8 sequences.
		tokens = append(tokens, bytes.Split(fields[i], nil)...)
	}
	return content, tokens
}

View file

@ -0,0 +1,15 @@
// +build flex
package tokenizer
import "github.com/go-enry/go-enry/v2/internal/tokenizer/flex"
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
//
// This variant hands the (possibly truncated) content to the flex-based scanner.
func Tokenize(content []byte) []string {
	limited := content
	if len(limited) > ByteLimit {
		limited = limited[:ByteLimit]
	}
	return flex.TokenizeFlex(limited)
}