feat: detect Interlisp sources as text (#8377)
Some checks are pending
/ release (push) Waiting to run
testing-integration / test-unit (push) Waiting to run
testing-integration / test-sqlite (push) Waiting to run
testing / backend-checks (push) Waiting to run
testing / frontend-checks (push) Waiting to run
testing / test-unit (push) Blocked by required conditions
testing / test-e2e (push) Blocked by required conditions
testing / test-remote-cacher (redis) (push) Blocked by required conditions
testing / test-remote-cacher (valkey) (push) Blocked by required conditions
testing / test-remote-cacher (garnet) (push) Blocked by required conditions
testing / test-remote-cacher (redict) (push) Blocked by required conditions
testing / test-mysql (push) Blocked by required conditions
testing / test-pgsql (push) Blocked by required conditions
testing / test-sqlite (push) Blocked by required conditions
testing / security-check (push) Blocked by required conditions

This PR detects Interlisp files (files that include "(DEFINE-FILE-INFO" somewhere near the start, and do not have an .LCOM extension) as text files and displays them as such in the web UI.
To check for extensions, I had to extend the `typesniffer.DetectContentType` function to accept an extra filename parameter—which could be useful for future filetype detection features. It is possible that a few of the places I modified pass a full file path instead of just passing a file name.

Implements #8184

## Checklist

### Tests

- I added test coverage for Go changes...
  - [x] in their respective `*_test.go` for unit tests.
  - [ ] in the `tests/integration` directory if it involves interactions with a live Forgejo server.
- I added test coverage for JavaScript changes... - NA
  - [ ] in `web_src/js/*.test.js` if it can be unit tested.
  - [ ] in `tests/e2e/*.test.e2e.js` if it requires interactions with a live Forgejo server (see also the [developer guide for JavaScript testing](https://codeberg.org/forgejo/forgejo/src/branch/forgejo/tests/e2e/README.md#end-to-end-tests)).

### Documentation

- [ ] I created a pull request [to the documentation](https://codeberg.org/forgejo/docs) to explain to Forgejo users how to use this change.
- [x] I did not document these changes and I do not expect someone else to do it.

### Release notes

- [ ] I do not want this change to show in the release notes.
- [x] I want the title to show in the release notes with a link to this pull request.
- [ ] I want the content of the `release-notes/<pull request number>.md` to be used for the release notes instead of the title.

<!--start release-notes-assistant-->

## Release notes
<!--URL:https://codeberg.org/forgejo/forgejo-->
- Features
  - [PR](https://codeberg.org/forgejo/forgejo/pulls/8377): <!--number 8377 --><!--line 0 --><!--description ZGV0ZWN0IEludGVybGlzcCBzb3VyY2VzIGFzIHRleHQ=-->detect Interlisp sources as text<!--description-->
<!--end release-notes-assistant-->

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/8377
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Bojidar Marinov <bojidar.marinov.bg@gmail.com>
Co-committed-by: Bojidar Marinov <bojidar.marinov.bg@gmail.com>
This commit is contained in:
Bojidar Marinov 2025-07-02 07:38:46 +02:00 committed by Earl Warren
parent 6f501b1fdf
commit 1ed750a33a
12 changed files with 82 additions and 61 deletions

View file

@ -220,7 +220,7 @@ func (b *Blob) GuessContentType() (typesniffer.SniffedType, error) {
}
defer r.Close()
return typesniffer.DetectContentTypeFromReader(r)
return typesniffer.DetectContentTypeFromReader(r, b.Name())
}
// GetBlob finds the blob object in the repository.

View file

@ -99,7 +99,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, filePath stri
Filename: path.Base(filePath),
}
sniffedType := typesniffer.DetectContentType(mineBuf)
sniffedType := typesniffer.DetectContentType(mineBuf, opts.Filename)
// the "render" parameter came from year 2016: 638dd24c, it doesn't have clear meaning, so I think it could be removed later
isPlain := sniffedType.IsText() || r.FormValue("render") != ""

View file

@ -177,7 +177,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
if err != nil {
return err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
} else if !typesniffer.DetectContentType(fileContents, update.Filename).IsText() {
// FIXME: UTF-16 files will probably fail here
// Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty.
fileContents = nil

View file

@ -144,7 +144,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
if err != nil {
return nil, err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
} else if !typesniffer.DetectContentType(fileContents, update.Filename).IsText() {
// FIXME: UTF-16 files will probably fail here
return nil, nil
}

View file

@ -124,7 +124,7 @@ func (ct SniffedType) GetMimeType() string {
}
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
func DetectContentType(data []byte) SniffedType {
func DetectContentType(data []byte, filename string) SniffedType {
if len(data) == 0 {
return SniffedType{"text/unknown"}
}
@ -176,6 +176,13 @@ func DetectContentType(data []byte) SniffedType {
}
}
if ct == "application/octet-stream" &&
filename != "" &&
!strings.HasSuffix(strings.ToUpper(filename), ".LCOM") &&
bytes.Contains(data, []byte("(DEFINE-FILE-INFO ")) {
ct = "text/vnd.interlisp"
}
// GLTF is unsupported by http.DetectContentType
// hexdump -n 4 -C glTF.glb
if bytes.HasPrefix(data, []byte("glTF")) {
@ -186,7 +193,7 @@ func DetectContentType(data []byte) SniffedType {
}
// DetectContentTypeFromReader guesses the content type contained in the reader.
func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
func DetectContentTypeFromReader(r io.Reader, filename string) (SniffedType, error) {
buf := make([]byte, sniffLen)
n, err := util.ReadAtMost(r, buf)
if err != nil {
@ -194,5 +201,5 @@ func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
}
buf = buf[:n]
return DetectContentType(buf), nil
return DetectContentType(buf, filename), nil
}

View file

@ -16,63 +16,63 @@ import (
func TestDetectContentTypeLongerThanSniffLen(t *testing.T) {
// Pre-condition: Shorter than sniffLen detects SVG.
assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType)
assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`), "").contentType)
// Longer than sniffLen detects something else.
assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`)).contentType)
assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`), "").contentType)
}
func TestIsTextFile(t *testing.T) {
assert.True(t, DetectContentType([]byte{}).IsText())
assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText())
assert.True(t, DetectContentType([]byte{}, "").IsText())
assert.True(t, DetectContentType([]byte("lorem ipsum"), "").IsText())
}
func TestIsSvgImage(t *testing.T) {
assert.True(t, DetectContentType([]byte("<svg></svg>")).IsSvgImage())
assert.True(t, DetectContentType([]byte(" <svg></svg>")).IsSvgImage())
assert.True(t, DetectContentType([]byte(`<svg width="100"></svg>`)).IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`)).IsSvgImage())
assert.True(t, DetectContentType([]byte("<svg></svg>"), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(" <svg></svg>"), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<svg width="100"></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<!-- Comment -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<!-- Multiple -->
<!-- Comments -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<!-- Multiline
Comment -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Comment -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Multiple -->
<!-- Comments -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- Multiline
Comment -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Multiline
Comment -->
<svg></svg>`)).IsSvgImage())
<svg></svg>`), "").IsSvgImage())
// the DetectContentType should work for incomplete data, because only beginning bytes are used for detection
assert.True(t, DetectContentType([]byte(`<svg>....`)).IsSvgImage())
assert.True(t, DetectContentType([]byte(`<svg>....`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte{}).IsSvgImage())
assert.False(t, DetectContentType([]byte("svg")).IsSvgImage())
assert.False(t, DetectContentType([]byte("<svgfoo></svgfoo>")).IsSvgImage())
assert.False(t, DetectContentType([]byte("text<svg></svg>")).IsSvgImage())
assert.False(t, DetectContentType([]byte("<html><body><svg></svg></body></html>")).IsSvgImage())
assert.False(t, DetectContentType([]byte(`<script>"<svg></svg>"</script>`)).IsSvgImage())
assert.False(t, DetectContentType([]byte{}, "").IsSvgImage())
assert.False(t, DetectContentType([]byte("svg"), "").IsSvgImage())
assert.False(t, DetectContentType([]byte("<svgfoo></svgfoo>"), "").IsSvgImage())
assert.False(t, DetectContentType([]byte("text<svg></svg>"), "").IsSvgImage())
assert.False(t, DetectContentType([]byte("<html><body><svg></svg></body></html>"), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`<script>"<svg></svg>"</script>`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`<!-- <svg></svg> inside comment -->
<foo></foo>`)).IsSvgImage())
<foo></foo>`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<!-- <svg></svg> inside comment -->
<foo></foo>`)).IsSvgImage())
<foo></foo>`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`
<!-- comment1 -->
@ -80,7 +80,7 @@ func TestIsSvgImage(t *testing.T) {
<!-- comment2 -->
<svg></svg>
</div>
`)).IsSvgImage())
`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`
<!-- comment1
@ -90,56 +90,56 @@ func TestIsSvgImage(t *testing.T) {
-->
<svg></svg>
</div>
`)).IsSvgImage())
assert.False(t, DetectContentType([]byte(`<html><body><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg></svg></body></html>`)).IsSvgImage())
assert.False(t, DetectContentType([]byte(`<html><body><?xml version="1.0" encoding="UTF-8"?><svg></svg></body></html>`)).IsSvgImage())
`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`<html><body><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg></svg></body></html>`), "").IsSvgImage())
assert.False(t, DetectContentType([]byte(`<html><body><?xml version="1.0" encoding="UTF-8"?><svg></svg></body></html>`), "").IsSvgImage())
}
func TestIsPDF(t *testing.T) {
pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe")
assert.True(t, DetectContentType(pdf).IsPDF())
assert.False(t, DetectContentType([]byte("plain text")).IsPDF())
assert.True(t, DetectContentType(pdf, "").IsPDF())
assert.False(t, DetectContentType([]byte("plain text"), "").IsPDF())
}
func TestIsVideo(t *testing.T) {
mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA")
assert.True(t, DetectContentType(mp4).IsVideo())
assert.False(t, DetectContentType([]byte("plain text")).IsVideo())
assert.True(t, DetectContentType(mp4, "").IsVideo())
assert.False(t, DetectContentType([]byte("plain text"), "").IsVideo())
}
func TestIsAudio(t *testing.T) {
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
assert.True(t, DetectContentType(mp3).IsAudio())
assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
assert.True(t, DetectContentType(mp3, "").IsAudio())
assert.False(t, DetectContentType([]byte("plain text"), "").IsAudio())
assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
assert.True(t, DetectContentType([]byte("ID3Toy\000"), "").IsAudio())
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."), "").IsText()) // test ID3 tag for plain text
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2]), "").IsText()) // test ID3 tag with incomplete UTF8 char
}
func TestIsGLB(t *testing.T) {
glb, _ := hex.DecodeString("676c5446")
assert.True(t, DetectContentType(glb).IsGLB())
assert.True(t, DetectContentType(glb).Is3DModel())
assert.False(t, DetectContentType([]byte("plain text")).IsGLB())
assert.False(t, DetectContentType([]byte("plain text")).Is3DModel())
assert.True(t, DetectContentType(glb, "").IsGLB())
assert.True(t, DetectContentType(glb, "").Is3DModel())
assert.False(t, DetectContentType([]byte("plain text"), "").IsGLB())
assert.False(t, DetectContentType([]byte("plain text"), "").Is3DModel())
}
func TestDetectContentTypeFromReader(t *testing.T) {
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
st, err := DetectContentTypeFromReader(bytes.NewReader(mp3))
st, err := DetectContentTypeFromReader(bytes.NewReader(mp3), "")
require.NoError(t, err)
assert.True(t, st.IsAudio())
}
func TestDetectContentTypeOgg(t *testing.T) {
oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000")
st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio))
st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio), "")
require.NoError(t, err)
assert.True(t, st.IsAudio())
oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001")
st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo))
st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo), "")
require.NoError(t, err)
assert.True(t, st.IsVideo())
}
@ -148,7 +148,7 @@ func TestDetectContentTypeAvif(t *testing.T) {
avifImage, err := hex.DecodeString("000000206674797061766966")
require.NoError(t, err)
st, err := DetectContentTypeFromReader(bytes.NewReader(avifImage))
st, err := DetectContentTypeFromReader(bytes.NewReader(avifImage), "")
require.NoError(t, err)
assert.True(t, st.IsImage())
@ -158,10 +158,24 @@ func TestDetectContentTypeModelGLB(t *testing.T) {
glb, err := hex.DecodeString("676c5446")
require.NoError(t, err)
st, err := DetectContentTypeFromReader(bytes.NewReader(glb))
st, err := DetectContentTypeFromReader(bytes.NewReader(glb), "")
require.NoError(t, err)
// print st for debugging
assert.Equal(t, "model/gltf-binary", st.GetMimeType())
assert.True(t, st.IsGLB())
}
// TestDetectInterlisp verifies that the Interlisp detection in
// DetectContentType depends on both the "(DEFINE-FILE-INFO " content marker
// and the filename handed to the sniffer.
func TestDetectInterlisp(t *testing.T) {
// Sample payload: decodes to a short "(DEFINE-FILE-INFO ..." header that also
// contains control bytes (presumably why the default sniffing would classify
// it as binary rather than text).
interlisp, err := base64.StdEncoding.DecodeString("ICAKKERFRklORS1GSUxFLUlORk8gHlBBQ0tBR0UgIklOVEVSTElTUCIgHlJFQURUQUJMRSAiSU5URVJMSVNQIiAeQkFTRSAxMCkKCgYB")
require.NoError(t, err)
// A non-empty filename without the .LCOM extension: reported as text.
st, err := DetectContentTypeFromReader(bytes.NewReader(interlisp), "test")
require.NoError(t, err)
assert.True(t, st.IsText())
// An empty filename disables the Interlisp check, so the same bytes are not text.
st, err = DetectContentTypeFromReader(bytes.NewReader(interlisp), "")
require.NoError(t, err)
assert.False(t, st.IsText())
// The .LCOM exclusion is case-insensitive: a lowercase ".lcom" name is not text either.
st, err = DetectContentTypeFromReader(bytes.NewReader(interlisp), "test.lcom")
require.NoError(t, err)
assert.False(t, st.IsText())
}

View file

@ -189,7 +189,7 @@ func editFile(ctx *context.Context, isNewFile bool) {
buf = buf[:n]
// Only some file types are editable online as text.
if !typesniffer.DetectContentType(buf).IsRepresentableAsText() {
if !typesniffer.DetectContentType(buf, blob.Name()).IsRepresentableAsText() {
ctx.NotFound("typesniffer.IsRepresentableAsText", nil)
return
}

View file

@ -41,7 +41,7 @@ func RenderFile(ctx *context.Context) {
n, _ := util.ReadAtMost(dataRc, buf)
buf = buf[:n]
st := typesniffer.DetectContentType(buf)
st := typesniffer.DetectContentType(buf, blob.Name())
isTextFile := st.IsText()
rd := charset.ToUTF8WithFallbackReader(io.MultiReader(bytes.NewReader(buf), dataRc), charset.ConvertOpts{})

View file

@ -45,7 +45,7 @@ func UpdateAvatarSetting(ctx *context.Context, form forms.AvatarForm) error {
if err != nil {
return fmt.Errorf("io.ReadAll: %w", err)
}
st := typesniffer.DetectContentType(data)
st := typesniffer.DetectContentType(data, "")
if !st.IsImage() || st.IsSvgImage() {
return errors.New(ctx.Locale.TrString("settings.uploaded_avatar_not_a_image"))
}

View file

@ -291,7 +291,7 @@ func LFSFileGet(ctx *context.Context) {
}
buf = buf[:n]
st := typesniffer.DetectContentType(buf)
st := typesniffer.DetectContentType(buf, "")
ctx.Data["IsTextFile"] = st.IsText()
isRepresentableAsText := st.IsRepresentableAsText()

View file

@ -228,7 +228,7 @@ func getFileReader(ctx gocontext.Context, repoID int64, blob *git.Blob) ([]byte,
n, _ := util.ReadAtMost(dataRc, buf)
buf = buf[:n]
st := typesniffer.DetectContentType(buf)
st := typesniffer.DetectContentType(buf, blob.Name())
isTextFile := st.IsText()
// FIXME: what happens when README file is an image?
@ -262,7 +262,7 @@ func getFileReader(ctx gocontext.Context, repoID int64, blob *git.Blob) ([]byte,
}
buf = buf[:n]
st = typesniffer.DetectContentType(buf)
st = typesniffer.DetectContentType(buf, blob.Name())
return buf, dataRc, &fileInfo{st.IsText(), true, meta.Size, &meta.Pointer, st}, nil
}

View file

@ -151,7 +151,7 @@ func UpdateAvatarSetting(ctx *context.Context, form *forms.AvatarForm, ctxUser *
return fmt.Errorf("io.ReadAll: %w", err)
}
st := typesniffer.DetectContentType(data)
st := typesniffer.DetectContentType(data, "")
if !st.IsImage() || st.IsSvgImage() {
return errors.New(ctx.Locale.TrString("settings.uploaded_avatar_not_a_image"))
}