Files
yoyo/internal/content/parser.go
Z.To 6807371c5e feat: add content filter and code processing module (v0.3.0)
- Add content filter module (internal/content/)
- Implement basic character filtering (control chars, line breaks, symbols)
- Implement code block and inline code detection
- Implement comment detection for 30+ languages (JS/Python/Go/HTML/etc)
- Add go-enry dependency for intelligent language detection
- Add SkipKeywords config option (default: TODO/FIXME/HACK/XXX/etc)
- Integrate content processing into Translator
- Update config.yaml with skip_keywords
2026-03-29 18:41:25 +08:00

454 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package content
import (
"fmt"
"regexp"
"strings"
"github.com/go-enry/go-enry/v2"
)
// SegmentType identifies what kind of content a ContentSegment holds.
type SegmentType int

// The segment kinds, numbered from zero in declaration order.
const (
	SegmentTypeText SegmentType = iota
	SegmentTypeCodeBlock
	SegmentTypeInlineCode
	SegmentTypeComment
)

// String returns the lowercase label of the segment type, or "unknown" for
// any value that is not one of the declared constants.
func (t SegmentType) String() string {
	labels := [...]string{
		SegmentTypeText:       "text",
		SegmentTypeCodeBlock:  "code_block",
		SegmentTypeInlineCode: "inline_code",
		SegmentTypeComment:    "comment",
	}
	if t < 0 || int(t) >= len(labels) {
		return "unknown"
	}
	return labels[t]
}
// ContentSegment is one contiguous piece of the parsed input: plain text,
// a fenced code block, an inline code span, or a comment.
type ContentSegment struct {
// Type is the kind of content held in this segment.
Type SegmentType
// Content is the raw text of the segment, including any backtick or
// comment delimiters that were matched.
Content string
// Translated is not written by the parsing code in this file.
// NOTE(review): presumably filled in by the translation step — confirm.
Translated string
// Language is the language tag associated with the segment; it may be empty.
Language string
// IsComment is set to true for segments of type SegmentTypeComment.
IsComment bool
// StartPos and EndPos are byte offsets delimiting Content (EndPos is
// exclusive) within the string the segment was cut from.
StartPos int
EndPos int
}
// ParseResult is the outcome of Parser.Parse.
type ParseResult struct {
// Segments lists the pieces of the input in the order they were produced.
Segments []ContentSegment
// SourceLang is the lowercase language name chosen by language detection
// for the input as a whole.
SourceLang string
// HasCode is true when at least one segment is a code block or inline code.
HasCode bool
}
// languageCommentPatterns describes the comment syntax of one language.
type languageCommentPatterns struct {
	// LineComment is the token that starts a comment running to the end of
	// the line, or "" when the language has no line comments. The value is
	// spliced into a regular expression without escaping, so it must be a
	// valid regex fragment.
	LineComment string
	// BlockComment lists block-comment delimiters as consecutive
	// opening/closing pairs: {open1, close1, open2, close2, ...}.
	BlockComment []string
}

// languagePatterns maps a lowercase language name to its comment syntax.
//
// Every LineComment is a plain literal token. Earlier revisions carried a
// stray empty regex group on some entries ("#()", "//()"); it matched nothing
// extra and has been dropped so the table is uniform.
var languagePatterns = map[string]languageCommentPatterns{
	"javascript":  {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"typescript":  {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"java":        {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"kotlin":      {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"scala":       {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"c":           {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"cpp":         {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"c#":          {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"go":          {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"rust":        {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"php":         {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"swift":       {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"objective-c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"scss":        {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"css":         {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
	"less":        {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
	"html":        {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
	"xml":         {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
	"sql":         {LineComment: `--`, BlockComment: []string{`/*`, `*/`}},
	"python":      {LineComment: `#`, BlockComment: []string{`"""`, `"""`}},
	"ruby":        {LineComment: `#`, BlockComment: []string{`=begin`, `=end`}},
	"shell":       {LineComment: `#`, BlockComment: []string{}},
	"bash":        {LineComment: `#`, BlockComment: []string{}},
	"powershell":  {LineComment: `#`, BlockComment: []string{`<#`, `#>`}},
	"yaml":        {LineComment: `#`, BlockComment: []string{}},
	"json":        {LineComment: ``, BlockComment: []string{}},
	"markdown":    {LineComment: ``, BlockComment: []string{}},
	"vue":         {LineComment: `//`, BlockComment: []string{`/*`, `*/`, `<!--`, `-->`}},
	"svelte":      {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"jsx":         {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
	"tsx":         {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
}

// defaultPatterns is the C-style comment syntax used for languages that have
// no entry in languagePatterns.
var defaultPatterns = languageCommentPatterns{
	LineComment:  `//`,
	BlockComment: []string{`/*`, `*/`},
}
// Parser splits text into translatable and non-translatable segments.
type Parser struct {
	// skipKeywords are the markers listed in the prompt as not to be translated.
	skipKeywords []string
	// fallbackLang is the language reported when detection yields nothing.
	fallbackLang string
}

// NewParser returns a Parser that uses skipKeywords as its list of
// untranslated markers. A nil slice selects the built-in defaults; an empty
// non-nil slice is kept as is. The fallback language is "javascript".
func NewParser(skipKeywords []string) *Parser {
	parser := &Parser{
		fallbackLang: "javascript",
		skipKeywords: skipKeywords,
	}
	if parser.skipKeywords == nil {
		parser.skipKeywords = []string{
			"TODO", "FIXME", "HACK", "XXX", "NOTE",
			"BUG", "WARN", "IMPORTANT", "TODO:",
			"FIXME:", "HACK:", "XXX:", "NOTE:",
			"BUG:", "WARN:", "IMPORTANT:",
		}
	}
	return parser
}
// Parse detects the language of text, splits it into segments and reports
// whether any of them is a code block or inline code span. The returned error
// is always nil.
func (p *Parser) Parse(text string) (*ParseResult, error) {
	lang := p.detectLanguage(text)
	parsed := &ParseResult{
		Segments:   []ContentSegment{},
		SourceLang: lang,
	}
	parsed.Segments = append(parsed.Segments, p.splitIntoSegments(text, lang)...)

	for i := range parsed.Segments {
		kind := parsed.Segments[i].Type
		if kind == SegmentTypeCodeBlock || kind == SegmentTypeInlineCode {
			parsed.HasCode = true
			break
		}
	}
	return parsed, nil
}
// detectLanguage guesses the programming language of text and returns its
// lowercase name.
//
// The sample handed to enry is built from the non-blank, whitespace-trimmed
// lines found between ``` fence lines. When that yields nothing, every
// non-blank line of the input is used instead (untrimmed). Only the first ten
// sample lines are classified. The parser's fallback language is returned
// when there is no sample or enry cannot decide.
func (p *Parser) detectLanguage(text string) string {
	allLines := strings.Split(text, "\n")

	var sample []string
	fenced := false
	for _, raw := range allLines {
		stripped := strings.TrimSpace(raw)
		switch {
		case strings.HasPrefix(stripped, "```"):
			// A fence line toggles the state and is never part of the sample.
			fenced = !fenced
		case fenced && stripped != "":
			sample = append(sample, stripped)
		}
	}

	if len(sample) == 0 {
		for _, raw := range allLines {
			if strings.TrimSpace(raw) == "" {
				continue
			}
			sample = append(sample, raw)
		}
	}
	if len(sample) == 0 {
		return p.fallbackLang
	}

	if len(sample) > 10 {
		sample = sample[:10]
	}
	guess := enry.GetLanguage("", []byte(strings.Join(sample, "\n")))
	if guess == "" {
		return p.fallbackLang
	}
	return strings.ToLower(guess)
}
// codeSpanPattern matches a fenced code block (an opening ``` up to the next
// ``` that begins a line) or an inline code span delimited by single
// backticks. The m flag is required: without it ^ only matches at the very
// start of the text and the fenced alternative can never succeed.
var codeSpanPattern = regexp.MustCompile("(?ms)```[\\s\\S]*?^```|`[^`]+`")

// splitIntoSegments cuts text into code segments (fenced blocks and inline
// spans) and the text between them. The in-between text is passed through
// parseTextContent so that comments become their own segments.
//
// lang is the language used for comment detection and recorded on inline
// code segments; fenced blocks get the language named on their fence line.
// StartPos/EndPos of every returned segment are byte offsets into text.
//
// When text contains no code at all it is returned as a single text segment
// without comment detection.
func (p *Parser) splitIntoSegments(text string, lang string) []ContentSegment {
	matches := codeSpanPattern.FindAllStringIndex(text, -1)
	if len(matches) == 0 {
		return []ContentSegment{{
			Type:     SegmentTypeText,
			Content:  text,
			StartPos: 0,
			EndPos:   len(text),
		}}
	}

	segments := make([]ContentSegment, 0, 2*len(matches)+1)

	// appendText parses text[from:to] and shifts the resulting offsets, which
	// parseTextContent reports relative to the substring, back into the
	// coordinate space of the full text.
	appendText := func(from, to int) {
		for _, seg := range p.parseTextContent(text[from:to], lang) {
			seg.StartPos += from
			seg.EndPos += from
			segments = append(segments, seg)
		}
	}

	lastEnd := 0
	for _, match := range matches {
		start, end := match[0], match[1]
		if start > lastEnd {
			appendText(lastEnd, start)
		}

		content := text[start:end]
		seg := ContentSegment{
			Type:     SegmentTypeInlineCode,
			Content:  content,
			Language: lang,
			StartPos: start,
			EndPos:   end,
		}
		// An inline match can never begin with two backticks, so a ``` prefix
		// identifies the fenced alternative.
		if strings.HasPrefix(content, "```") {
			seg.Type = SegmentTypeCodeBlock
			seg.Language = p.detectCodeBlockLang(content)
		}
		segments = append(segments, seg)
		lastEnd = end
	}

	if lastEnd < len(text) {
		appendText(lastEnd, len(text))
	}
	return segments
}
// parseTextContent splits text into alternating plain-text and comment
// segments using the comment syntax of lang. If the language defines no
// comment syntax, no regex can be built, or nothing matches, the whole text
// comes back as one text segment. Offsets in the returned segments are
// relative to text.
func (p *Parser) parseTextContent(text string, lang string) []ContentSegment {
	plain := func(from, to int) ContentSegment {
		return ContentSegment{
			Type:     SegmentTypeText,
			Content:  text[from:to],
			Language: lang,
			StartPos: from,
			EndPos:   to,
		}
	}
	wholeText := func() []ContentSegment {
		return []ContentSegment{plain(0, len(text))}
	}

	syntax := getLanguagePatterns(lang)
	if syntax.SingleLine == "" && len(syntax.MultiLine) == 0 {
		return wholeText()
	}
	commentRe := p.buildCommentRegex(syntax)
	if commentRe == nil {
		return wholeText()
	}
	hits := commentRe.FindAllStringIndex(text, -1)
	if len(hits) == 0 {
		return wholeText()
	}

	out := []ContentSegment{}
	cursor := 0
	for _, hit := range hits {
		from, to := hit[0], hit[1]
		if from > cursor {
			out = append(out, plain(cursor, from))
		}
		out = append(out, ContentSegment{
			Type:      SegmentTypeComment,
			Content:   text[from:to],
			IsComment: true,
			Language:  lang,
			StartPos:  from,
			EndPos:    to,
		})
		cursor = to
	}
	if cursor < len(text) {
		out = append(out, plain(cursor, len(text)))
	}
	return out
}
// languageCommentRegex is the comment syntax of one language in the form
// consumed by buildCommentRegex.
type languageCommentRegex struct {
// SingleLine is the line-comment marker. It is concatenated into the
// regex without escaping, so it is interpreted as a regex fragment.
SingleLine string
// MultiLine holds block-comment delimiter pairs. Start and End are
// escaped with regexp.QuoteMeta before use, so they are literal text.
MultiLine []struct {
Start string
End string
}
}
// buildCommentRegex compiles one multi-line-mode regex that matches any
// comment described by patterns: the single-line marker through end of line,
// or any block delimiter pair with the shortest possible body. Pairs with an
// empty Start or End are ignored. It returns nil when patterns describes no
// usable comment syntax, and panics if the assembled expression is invalid.
func (p *Parser) buildCommentRegex(patterns languageCommentRegex) *regexp.Regexp {
	alternatives := make([]string, 0, len(patterns.MultiLine)+1)

	if marker := patterns.SingleLine; marker != "" {
		// The marker is used verbatim as a regex fragment.
		alternatives = append(alternatives, marker+`.*$`)
	}
	for _, pair := range patterns.MultiLine {
		if pair.Start == "" || pair.End == "" {
			continue
		}
		alternatives = append(alternatives,
			regexp.QuoteMeta(pair.Start)+`[\s\S]*?`+regexp.QuoteMeta(pair.End))
	}

	if len(alternatives) == 0 {
		return nil
	}
	return regexp.MustCompile(`(?m)` + strings.Join(alternatives, "|"))
}
// getLanguagePatterns returns the comment syntax for lang in the form used by
// buildCommentRegex. Languages missing from languagePatterns get
// defaultPatterns.
//
// BlockComment is a flat list of opening/closing delimiters, so entries are
// consumed two at a time: element i is the opener and element i+1 its closer.
// (Splitting each individual string in half, as an earlier version did,
// turned "/*" into the pair "/" … "*" and matched unrelated text.) An odd
// trailing entry has no partner and is ignored.
func getLanguagePatterns(lang string) languageCommentRegex {
	patterns, ok := languagePatterns[lang]
	if !ok {
		patterns = defaultPatterns
	}

	result := languageCommentRegex{
		SingleLine: patterns.LineComment,
	}
	delims := patterns.BlockComment
	for i := 0; i+1 < len(delims); i += 2 {
		result.MultiLine = append(result.MultiLine, struct {
			Start string
			End   string
		}{Start: delims[i], End: delims[i+1]})
	}
	return result
}
// detectCodeBlockLang returns the lowercase language tag written after the
// opening ``` of codeBlock, provided that tag is a key of languagePatterns.
// It returns "" when the block has no line break, no tag, or an unknown tag.
func (p *Parser) detectCodeBlockLang(codeBlock string) string {
	lineEnd := strings.IndexByte(codeBlock, '\n')
	if lineEnd < 0 {
		return ""
	}

	info := strings.TrimSpace(codeBlock[:lineEnd])
	info = strings.TrimSpace(strings.TrimPrefix(info, "```"))
	if info == "" {
		return ""
	}

	tag := strings.ToLower(info)
	if _, known := languagePatterns[tag]; !known {
		return ""
	}
	return tag
}
// BuildPrompt assembles the translation prompt for result: a fixed set of
// instructions (in Chinese), an optional line listing the parser's skip
// keywords, and the translatable text of result enclosed between "---" lines.
func (p *Parser) BuildPrompt(result *ParseResult) string {
	var b strings.Builder

	header := []string{
		"你是一位专业的技术翻译。请翻译以下内容,遵守以下规则:\n\n",
		"需要翻译的部分:\n",
		"- 普通文本:翻译成目标语言\n",
		"- 代码注释:只翻译注释中有意义的词汇,技术术语保留原语言\n\n",
		"需要保持不变的部分:\n",
		"- 代码块(如 ```javascript ... ```)保持原样\n",
		"- 行内代码(如 `const count = 10`)保持原样\n",
	}
	for _, line := range header {
		b.WriteString(line)
	}

	if len(p.skipKeywords) > 0 {
		fmt.Fprintf(&b, "- 以下关键词不翻译:%s\n", strings.Join(p.skipKeywords, "、"))
	}

	b.WriteString("\n请将需要翻译的部分翻译成中文其他部分保持不变。\n\n")
	b.WriteString("原文:\n---\n")
	b.WriteString(p.extractTextForTranslation(result))
	b.WriteString("\n---")
	return b.String()
}
// extractTextForTranslation concatenates, in order and with no separator, the
// content of every text and comment segment in result. Code blocks and inline
// code contribute nothing.
func (p *Parser) extractTextForTranslation(result *ParseResult) string {
	var b strings.Builder
	for i := range result.Segments {
		seg := &result.Segments[i]
		if seg.Type == SegmentTypeText || seg.Type == SegmentTypeComment {
			b.WriteString(seg.Content)
		}
	}
	return b.String()
}
// Reconstruct rebuilds the document from result, substituting translatedText
// for the text and comment segments and copying code segments verbatim.
//
// The translatable segments were sent for translation as one concatenated
// string, so translatedText has to be divided among them again. Each text or
// comment segment receives as many newline-terminated lines as its original
// Content contains line breaks; the last such segment receives whatever is
// left. This keeps line breaks and never discards translated lines.
//
// The split is exact only when the translation preserves the line structure
// and segments end on line boundaries. Text interrupted mid-line by inline
// code cannot be mapped back precisely: the whole translated line goes to the
// segment that holds the line's terminating newline.
func (p *Parser) Reconstruct(result *ParseResult, translatedText string) string {
	// Locate the last translatable segment; it absorbs the remainder.
	lastTranslatable := -1
	for i := range result.Segments {
		switch result.Segments[i].Type {
		case SegmentTypeText, SegmentTypeComment:
			lastTranslatable = i
		}
	}

	var output strings.Builder
	remaining := translatedText
	for i := range result.Segments {
		seg := &result.Segments[i]
		switch seg.Type {
		case SegmentTypeText, SegmentTypeComment:
			if i == lastTranslatable {
				output.WriteString(remaining)
				remaining = ""
				continue
			}
			cut := offsetAfterNewlines(remaining, strings.Count(seg.Content, "\n"))
			output.WriteString(remaining[:cut])
			remaining = remaining[cut:]
		case SegmentTypeCodeBlock, SegmentTypeInlineCode:
			output.WriteString(seg.Content)
		}
	}
	return output.String()
}

// offsetAfterNewlines returns the byte offset just past the n-th "\n" in s,
// or len(s) when s contains fewer than n line breaks. It returns 0 for n <= 0.
func offsetAfterNewlines(s string, n int) int {
	end := 0
	for ; n > 0; n-- {
		i := strings.IndexByte(s[end:], '\n')
		if i < 0 {
			return len(s)
		}
		end += i + 1
	}
	return end
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}