main #1
39
changelog.md
39
changelog.md
@@ -32,6 +32,45 @@
|
||||
|
||||
## 版本历史
|
||||
|
||||
### 0.3.0 (2026-03-29) - 内容过滤与代码处理
|
||||
**类型**: 功能版本
|
||||
**状态**: 开发中
|
||||
|
||||
**变更内容**:
|
||||
- ✅ 添加内容过滤模块 (internal/content/)
|
||||
- ✅ 实现基础字符过滤(移除控制字符、规范化换行符、截断超长符号)
|
||||
- ✅ 实现代码块和行内代码识别
|
||||
- ✅ 实现代码注释智能识别(支持 JS/TS/Java/Python/Go/HTML 等 30+ 语言)
|
||||
- ✅ 添加 go-enry 依赖实现编程语言智能检测
|
||||
- ✅ 添加 SkipKeywords 配置项,默认保留 TODO/FIXME/HACK 等关键词不翻译
|
||||
- ✅ 集成内容处理到 Translator 模块
|
||||
|
||||
**新增文件**:
|
||||
- `internal/content/content.go` - 模块入口
|
||||
- `internal/content/filter.go` - 基础字符过滤
|
||||
- `internal/content/parser.go` - 内容解析器和语言检测
|
||||
|
||||
**配置更新**:
|
||||
- `configs/config.yaml` 新增 `skip_keywords` 配置项
|
||||
- 支持用户自定义不翻译的关键词列表
|
||||
|
||||
**使用示例**:
|
||||
```bash
|
||||
# 翻译包含代码的文档,自动识别代码和注释
|
||||
yoyo '这是一个文档 ```js // TODO: fix this ```'
|
||||
# 代码块保持不变,只翻译注释中的词汇
|
||||
# TODO: 修复这个
|
||||
```
|
||||
|
||||
**讨论记录**:
|
||||
- [内容过滤与代码处理设计](taolun.md#内容过滤与代码处理设计)
|
||||
|
||||
**下一步**:
|
||||
- 实现更多厂商(火山引擎、国家超算、Qwen、OpenAI兼容)
|
||||
- 添加配置文件路径查找机制
|
||||
- 实现配置文件迁移工具
|
||||
- 完善错误处理和用户体验
|
||||
|
||||
### 0.2.0 (2026-03-29) - 语言支持和配置向导
|
||||
**类型**: 功能版本
|
||||
**状态**: 开发中
|
||||
|
||||
2
go.mod
2
go.mod
@@ -4,6 +4,8 @@ go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/AlecAivazis/survey/v2 v2.3.7 // indirect
|
||||
github.com/go-enry/go-enry/v2 v2.9.5
|
||||
github.com/go-enry/go-oniguruma v1.2.1 // indirect
|
||||
github.com/joho/godotenv v1.5.1 // indirect
|
||||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect
|
||||
github.com/mattn/go-colorable v0.1.2 // indirect
|
||||
|
||||
10
go.sum
10
go.sum
@@ -4,6 +4,10 @@ github.com/Netflix/go-expect v0.0.0-20220104043353-73e0943537d2/go.mod h1:HBCaDe
|
||||
github.com/creack/pty v1.1.17/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-enry/go-enry/v2 v2.9.5 h1:HPhAQQHYwJgihL2PxBZiUMFWiROsGwOBdB6/D8zCUhY=
|
||||
github.com/go-enry/go-enry/v2 v2.9.5/go.mod h1:9yrj4ES1YrbNb1Wb7/PWYr2bpaCXUGRt0uafN0ISyG8=
|
||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
|
||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/hinshun/vt10x v0.0.0-20220119200601-820417d04eec/go.mod h1:Q48J4R4DvxnHolD5P8pOtXigYlRuPLGl6moFx3ulM68=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
@@ -17,7 +21,12 @@ github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b h1:j7+1HpAFS1zy5+Q4qx1f
|
||||
github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
@@ -47,6 +56,7 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
||||
@@ -23,6 +23,9 @@ type Config struct {
|
||||
|
||||
// Prompt配置
|
||||
Prompts map[string]string `yaml:"prompts"`
|
||||
|
||||
// 内容过滤配置
|
||||
SkipKeywords []string `yaml:"skip_keywords"` // 不翻译的关键词
|
||||
}
|
||||
|
||||
// ProviderConfig 厂商配置
|
||||
@@ -121,6 +124,16 @@ func (c *Config) setDefaults() {
|
||||
if c.Prompts == nil {
|
||||
c.Prompts = make(map[string]string)
|
||||
}
|
||||
|
||||
// 设置默认关键词
|
||||
if c.SkipKeywords == nil {
|
||||
c.SkipKeywords = []string{
|
||||
"TODO", "FIXME", "HACK", "XXX", "NOTE",
|
||||
"BUG", "WARN", "IMPORTANT", "TODO:",
|
||||
"FIXME:", "HACK:", "XXX:", "NOTE:",
|
||||
"BUG:", "WARN:", "IMPORTANT:",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetProviderConfig 获取指定厂商的配置
|
||||
|
||||
17
internal/content/content.go
Normal file
17
internal/content/content.go
Normal file
@@ -0,0 +1,17 @@
|
||||
package content
|
||||
|
||||
import (
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// Version is the version string exported by the content package.
const Version = "1.0.0"
|
||||
|
||||
func DetectLanguage(text string) string {
|
||||
return enry.GetLanguage("", []byte(text))
|
||||
}
|
||||
|
||||
func Filter(text string) string {
|
||||
return FilterBasic(text, nil)
|
||||
}
|
||||
55
internal/content/filter.go
Normal file
55
internal/content/filter.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package content
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// FilterOptions selects which clean-up steps FilterBasic performs.
type FilterOptions struct {
	// RemoveControlChars enables deletion of the characters matched by
	// controlCharsRegex. NormalizeLineBreaks enables rewriting "\r\n" and
	// "\r" to "\n".
	RemoveControlChars, NormalizeLineBreaks bool

	// MaxConsecutiveSymbols is the run length passed to
	// truncateConsecutiveSymbols; FilterBasic skips that step unless the
	// value is greater than zero.
	MaxConsecutiveSymbols int
}
|
||||
|
||||
var defaultFilterOptions = &FilterOptions{
|
||||
RemoveControlChars: true,
|
||||
NormalizeLineBreaks: true,
|
||||
MaxConsecutiveSymbols: 20,
|
||||
}
|
||||
|
||||
var controlCharsRegex = regexp.MustCompile(`[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]`)
|
||||
|
||||
func FilterBasic(text string, opts *FilterOptions) string {
|
||||
if opts == nil {
|
||||
opts = defaultFilterOptions
|
||||
}
|
||||
|
||||
result := text
|
||||
|
||||
if opts.RemoveControlChars {
|
||||
result = controlCharsRegex.ReplaceAllString(result, "")
|
||||
}
|
||||
|
||||
if opts.NormalizeLineBreaks {
|
||||
result = strings.ReplaceAll(result, "\r\n", "\n")
|
||||
result = strings.ReplaceAll(result, "\r", "\n")
|
||||
}
|
||||
|
||||
if opts.MaxConsecutiveSymbols > 0 {
|
||||
result = truncateConsecutiveSymbols(result, opts.MaxConsecutiveSymbols)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// truncateConsecutiveSymbols shortens every run of one repeated decoration
// symbol (= - _ * # ~ ` .) that is longer than maxCount down to exactly
// maxCount characters. All other text is copied unchanged.
//
// A non-positive maxCount disables truncation and returns text as is.
//
// The previous implementation assembled a regular expression per symbol, but
// the pattern was syntactically invalid, encoded the count as a control
// character and did not escape metacharacters such as '*' and '.', so
// regexp.MustCompile panicked on the first call. A plain scan avoids regular
// expressions altogether.
func truncateConsecutiveSymbols(text string, maxCount int) string {
	if maxCount <= 0 || text == "" {
		return text
	}

	const symbols = "=-_*#~`."

	var out strings.Builder
	out.Grow(len(text))

	// Scanning bytes is safe for UTF-8 input: every symbol is ASCII, and
	// ASCII bytes never occur inside a multi-byte sequence.
	run := 0
	for i := 0; i < len(text); i++ {
		c := text[i]
		if strings.IndexByte(symbols, c) < 0 {
			run = 0
			out.WriteByte(c)
			continue
		}
		if i > 0 && text[i-1] == c {
			run++
		} else {
			run = 1
		}
		// Drop only the part of the run that exceeds the limit.
		if run <= maxCount {
			out.WriteByte(c)
		}
	}
	return out.String()
}
|
||||
453
internal/content/parser.go
Normal file
453
internal/content/parser.go
Normal file
@@ -0,0 +1,453 @@
|
||||
package content
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// SegmentType classifies a ContentSegment.
type SegmentType int

// The segment kinds produced by the parser.
const (
	SegmentTypeText SegmentType = iota
	SegmentTypeCodeBlock
	SegmentTypeInlineCode
	SegmentTypeComment
)

// String returns the lower-case name of the segment type, or "unknown" for a
// value outside the declared constants.
func (t SegmentType) String() string {
	names := [...]string{
		SegmentTypeText:       "text",
		SegmentTypeCodeBlock:  "code_block",
		SegmentTypeInlineCode: "inline_code",
		SegmentTypeComment:    "comment",
	}
	if t >= 0 && int(t) < len(names) {
		return names[t]
	}
	return "unknown"
}
|
||||
|
||||
type ContentSegment struct {
|
||||
Type SegmentType
|
||||
Content string
|
||||
Translated string
|
||||
Language string
|
||||
IsComment bool
|
||||
StartPos int
|
||||
EndPos int
|
||||
}
|
||||
|
||||
type ParseResult struct {
|
||||
Segments []ContentSegment
|
||||
SourceLang string
|
||||
HasCode bool
|
||||
}
|
||||
|
||||
type languageCommentPatterns struct {
|
||||
LineComment string
|
||||
BlockComment []string
|
||||
}
|
||||
|
||||
var languagePatterns = map[string]languageCommentPatterns{
|
||||
"javascript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"typescript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"java": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"kotlin": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"scala": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"cpp": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"c#": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"go": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"rust": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"php": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"swift": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"objective-c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"scss": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
||||
"css": {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
|
||||
"less": {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
|
||||
"html": {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
|
||||
"xml": {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
|
||||
"sql": {LineComment: `--`, BlockComment: []string{`/*`, `*/`}},
|
||||
"python": {LineComment: `#`, BlockComment: []string{`"""`, `"""`}},
|
||||
"ruby": {LineComment: `#`, BlockComment: []string{`=begin`, `=end`}},
|
||||
"shell": {LineComment: `#`, BlockComment: []string{}},
|
||||
"bash": {LineComment: `#`, BlockComment: []string{}},
|
||||
"powershell": {LineComment: `#`, BlockComment: []string{`<#`, `#>`}},
|
||||
"yaml": {LineComment: `#()`, BlockComment: []string{}},
|
||||
"json": {LineComment: ``, BlockComment: []string{}},
|
||||
"markdown": {LineComment: ``, BlockComment: []string{}},
|
||||
"vue": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`, `<!--`, `-->`}},
|
||||
"svelte": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
||||
"jsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
||||
"tsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
||||
}
|
||||
|
||||
var defaultPatterns = languageCommentPatterns{
|
||||
LineComment: `//`,
|
||||
BlockComment: []string{`/*`, `*/`},
|
||||
}
|
||||
|
||||
// Parser splits text into translatable and non-translatable segments.
type Parser struct {
	// skipKeywords are the words listed in the prompt as "do not translate".
	skipKeywords []string
	// fallbackLang is returned by language detection when nothing better is
	// found.
	fallbackLang string
}

// NewParser returns a Parser whose fallback language is "javascript".
// A nil skipKeywords selects the built-in keyword list; any non-nil slice,
// including an empty one, is used as given.
func NewParser(skipKeywords []string) *Parser {
	parser := &Parser{
		fallbackLang: "javascript",
		skipKeywords: skipKeywords,
	}
	if parser.skipKeywords != nil {
		return parser
	}

	parser.skipKeywords = []string{
		"TODO", "FIXME", "HACK", "XXX", "NOTE",
		"BUG", "WARN", "IMPORTANT", "TODO:",
		"FIXME:", "HACK:", "XXX:", "NOTE:",
		"BUG:", "WARN:", "IMPORTANT:",
	}
	return parser
}
|
||||
|
||||
func (p *Parser) Parse(text string) (*ParseResult, error) {
|
||||
result := &ParseResult{
|
||||
Segments: []ContentSegment{},
|
||||
}
|
||||
|
||||
detectedLang := p.detectLanguage(text)
|
||||
result.SourceLang = detectedLang
|
||||
|
||||
segments := p.splitIntoSegments(text, result.SourceLang)
|
||||
|
||||
for _, seg := range segments {
|
||||
if seg.Type == SegmentTypeCodeBlock || seg.Type == SegmentTypeInlineCode {
|
||||
result.HasCode = true
|
||||
}
|
||||
result.Segments = append(result.Segments, seg)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *Parser) detectLanguage(text string) string {
|
||||
lines := strings.Split(text, "\n")
|
||||
var codeLines []string
|
||||
inCodeBlock := false
|
||||
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if strings.HasPrefix(trimmed, "```") {
|
||||
inCodeBlock = !inCodeBlock
|
||||
continue
|
||||
}
|
||||
if inCodeBlock && trimmed != "" {
|
||||
codeLines = append(codeLines, trimmed)
|
||||
}
|
||||
}
|
||||
|
||||
if len(codeLines) == 0 {
|
||||
for _, line := range lines {
|
||||
if strings.TrimSpace(line) != "" {
|
||||
codeLines = append(codeLines, line)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(codeLines) == 0 {
|
||||
return p.fallbackLang
|
||||
}
|
||||
|
||||
sample := strings.Join(codeLines[:min(len(codeLines), 10)], "\n")
|
||||
lang := enry.GetLanguage("", []byte(sample))
|
||||
|
||||
if lang == "" {
|
||||
return p.fallbackLang
|
||||
}
|
||||
|
||||
return strings.ToLower(lang)
|
||||
}
|
||||
|
||||
func (p *Parser) splitIntoSegments(text string, lang string) []ContentSegment {
|
||||
segments := []ContentSegment{}
|
||||
|
||||
codeBlockPattern := regexp.MustCompile("(?s)```[\\s\\S]*?^```|`[^`]+`")
|
||||
matches := codeBlockPattern.FindAllStringIndex(text, -1)
|
||||
|
||||
if len(matches) == 0 {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text,
|
||||
StartPos: 0,
|
||||
EndPos: len(text),
|
||||
})
|
||||
return segments
|
||||
}
|
||||
|
||||
lastEnd := 0
|
||||
for _, match := range matches {
|
||||
start, end := match[0], match[1]
|
||||
|
||||
if start > lastEnd {
|
||||
textPart := text[lastEnd:start]
|
||||
textSegments := p.parseTextContent(textPart, lang)
|
||||
segments = append(segments, textSegments...)
|
||||
}
|
||||
|
||||
content := text[start:end]
|
||||
isInline := len(content) > 0 && content[0] == '`' && (len(content) == 1 || content[len(content)-1] == '`')
|
||||
|
||||
if strings.HasPrefix(content, "```") {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeCodeBlock,
|
||||
Content: content,
|
||||
Language: p.detectCodeBlockLang(content),
|
||||
StartPos: start,
|
||||
EndPos: end,
|
||||
})
|
||||
} else if isInline {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeInlineCode,
|
||||
Content: content,
|
||||
Language: lang,
|
||||
StartPos: start,
|
||||
EndPos: end,
|
||||
})
|
||||
}
|
||||
|
||||
lastEnd = end
|
||||
}
|
||||
|
||||
if lastEnd < len(text) {
|
||||
textPart := text[lastEnd:]
|
||||
textSegments := p.parseTextContent(textPart, lang)
|
||||
segments = append(segments, textSegments...)
|
||||
}
|
||||
|
||||
return segments
|
||||
}
|
||||
|
||||
func (p *Parser) parseTextContent(text string, lang string) []ContentSegment {
|
||||
segments := []ContentSegment{}
|
||||
langPatterns := getLanguagePatterns(lang)
|
||||
|
||||
if langPatterns.SingleLine == "" && len(langPatterns.MultiLine) == 0 {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text,
|
||||
Language: lang,
|
||||
StartPos: 0,
|
||||
EndPos: len(text),
|
||||
})
|
||||
return segments
|
||||
}
|
||||
|
||||
commentPatterns := p.buildCommentRegex(langPatterns)
|
||||
if commentPatterns == nil {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text,
|
||||
Language: lang,
|
||||
StartPos: 0,
|
||||
EndPos: len(text),
|
||||
})
|
||||
return segments
|
||||
}
|
||||
|
||||
matches := commentPatterns.FindAllStringIndex(text, -1)
|
||||
if len(matches) == 0 {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text,
|
||||
Language: lang,
|
||||
StartPos: 0,
|
||||
EndPos: len(text),
|
||||
})
|
||||
return segments
|
||||
}
|
||||
|
||||
lastEnd := 0
|
||||
for _, match := range matches {
|
||||
start, end := match[0], match[1]
|
||||
|
||||
if start > lastEnd {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text[lastEnd:start],
|
||||
Language: lang,
|
||||
StartPos: lastEnd,
|
||||
EndPos: start,
|
||||
})
|
||||
}
|
||||
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeComment,
|
||||
Content: text[start:end],
|
||||
IsComment: true,
|
||||
Language: lang,
|
||||
StartPos: start,
|
||||
EndPos: end,
|
||||
})
|
||||
|
||||
lastEnd = end
|
||||
}
|
||||
|
||||
if lastEnd < len(text) {
|
||||
segments = append(segments, ContentSegment{
|
||||
Type: SegmentTypeText,
|
||||
Content: text[lastEnd:],
|
||||
Language: lang,
|
||||
StartPos: lastEnd,
|
||||
EndPos: len(text),
|
||||
})
|
||||
}
|
||||
|
||||
return segments
|
||||
}
|
||||
|
||||
// languageCommentRegex is the comment syntax of one language in the shape
// buildCommentRegex consumes.
type languageCommentRegex struct {
	// SingleLine is the line-comment marker; empty when there is none.
	SingleLine string
	// MultiLine lists the block-comment delimiter pairs.
	MultiLine []struct {
		Start, End string
	}
}
|
||||
|
||||
func (p *Parser) buildCommentRegex(patterns languageCommentRegex) *regexp.Regexp {
|
||||
var parts []string
|
||||
|
||||
if patterns.SingleLine != "" {
|
||||
parts = append(parts, patterns.SingleLine+`.*$`)
|
||||
}
|
||||
|
||||
for _, multi := range patterns.MultiLine {
|
||||
if multi.Start != "" && multi.End != "" {
|
||||
escapedStart := regexp.QuoteMeta(multi.Start)
|
||||
escapedEnd := regexp.QuoteMeta(multi.End)
|
||||
parts = append(parts, escapedStart+`[\s\S]*?`+escapedEnd)
|
||||
}
|
||||
}
|
||||
|
||||
if len(parts) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
pattern := `(?m)` + strings.Join(parts, "|")
|
||||
return regexp.MustCompile(pattern)
|
||||
}
|
||||
|
||||
func getLanguagePatterns(lang string) languageCommentRegex {
|
||||
patterns, ok := languagePatterns[lang]
|
||||
if !ok {
|
||||
patterns = defaultPatterns
|
||||
}
|
||||
|
||||
result := languageCommentRegex{
|
||||
SingleLine: patterns.LineComment,
|
||||
}
|
||||
|
||||
for _, bc := range patterns.BlockComment {
|
||||
if len(bc) >= 2 {
|
||||
result.MultiLine = append(result.MultiLine, struct {
|
||||
Start string
|
||||
End string
|
||||
}{Start: bc[:len(bc)/2], End: bc[len(bc)/2:]})
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func (p *Parser) detectCodeBlockLang(codeBlock string) string {
|
||||
lines := strings.Split(codeBlock, "\n")
|
||||
if len(lines) < 2 {
|
||||
return ""
|
||||
}
|
||||
|
||||
firstLine := strings.TrimSpace(lines[0])
|
||||
firstLine = strings.TrimPrefix(firstLine, "```")
|
||||
firstLine = strings.TrimSpace(firstLine)
|
||||
|
||||
if firstLine != "" {
|
||||
lang := strings.ToLower(firstLine)
|
||||
if _, ok := languagePatterns[lang]; ok {
|
||||
return lang
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
func (p *Parser) BuildPrompt(result *ParseResult) string {
|
||||
var prompt strings.Builder
|
||||
|
||||
prompt.WriteString("你是一位专业的技术翻译。请翻译以下内容,遵守以下规则:\n\n")
|
||||
|
||||
prompt.WriteString("需要翻译的部分:\n")
|
||||
prompt.WriteString("- 普通文本:翻译成目标语言\n")
|
||||
prompt.WriteString("- 代码注释:只翻译注释中有意义的词汇,技术术语保留原语言\n\n")
|
||||
|
||||
prompt.WriteString("需要保持不变的部分:\n")
|
||||
prompt.WriteString("- 代码块(如 ```javascript ... ```)保持原样\n")
|
||||
prompt.WriteString("- 行内代码(如 `const count = 10`)保持原样\n")
|
||||
|
||||
if len(p.skipKeywords) > 0 {
|
||||
prompt.WriteString(fmt.Sprintf("- 以下关键词不翻译:%s\n", strings.Join(p.skipKeywords, "、")))
|
||||
}
|
||||
|
||||
prompt.WriteString("\n请将需要翻译的部分翻译成中文,其他部分保持不变。\n\n")
|
||||
prompt.WriteString("原文:\n---\n")
|
||||
|
||||
textToTranslate := p.extractTextForTranslation(result)
|
||||
prompt.WriteString(textToTranslate)
|
||||
|
||||
prompt.WriteString("\n---")
|
||||
|
||||
return prompt.String()
|
||||
}
|
||||
|
||||
func (p *Parser) extractTextForTranslation(result *ParseResult) string {
|
||||
var text strings.Builder
|
||||
|
||||
for _, seg := range result.Segments {
|
||||
switch seg.Type {
|
||||
case SegmentTypeText:
|
||||
text.WriteString(seg.Content)
|
||||
case SegmentTypeComment:
|
||||
text.WriteString(seg.Content)
|
||||
case SegmentTypeCodeBlock, SegmentTypeInlineCode:
|
||||
}
|
||||
}
|
||||
|
||||
return text.String()
|
||||
}
|
||||
|
||||
func (p *Parser) Reconstruct(result *ParseResult, translatedText string) string {
|
||||
translatedLines := strings.Split(translatedText, "\n")
|
||||
var output strings.Builder
|
||||
|
||||
textIndex := 0
|
||||
|
||||
for _, seg := range result.Segments {
|
||||
switch seg.Type {
|
||||
case SegmentTypeText, SegmentTypeComment:
|
||||
if textIndex < len(translatedLines) {
|
||||
output.WriteString(translatedLines[textIndex])
|
||||
textIndex++
|
||||
}
|
||||
case SegmentTypeCodeBlock, SegmentTypeInlineCode:
|
||||
output.WriteString(seg.Content)
|
||||
}
|
||||
}
|
||||
|
||||
return output.String()
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/titor/fanyi/internal/config"
|
||||
"github.com/titor/fanyi/internal/content"
|
||||
"github.com/titor/fanyi/internal/provider"
|
||||
)
|
||||
|
||||
@@ -14,6 +15,7 @@ type Translator struct {
|
||||
config *config.Config
|
||||
provider provider.Provider
|
||||
prompt *PromptManager
|
||||
contentParser *content.Parser
|
||||
}
|
||||
|
||||
// NewTranslator 创建翻译器实例
|
||||
@@ -22,6 +24,7 @@ func NewTranslator(config *config.Config, provider provider.Provider) *Translato
|
||||
config: config,
|
||||
provider: provider,
|
||||
prompt: NewPromptManager(config.Prompts),
|
||||
contentParser: content.NewParser(config.SkipKeywords),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,15 +34,33 @@ func (t *Translator) Translate(ctx context.Context, text string, options *Transl
|
||||
timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(t.config.Timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// 基础字符过滤
|
||||
filteredText := content.FilterBasic(text, nil)
|
||||
|
||||
// 内容解析(包含代码检测)
|
||||
parseResult, parseErr := t.contentParser.Parse(filteredText)
|
||||
|
||||
// 选择Prompt
|
||||
prompt := ""
|
||||
if options.PromptName != "" {
|
||||
prompt = t.prompt.GetPrompt(options.PromptName)
|
||||
}
|
||||
|
||||
// 如果包含代码且解析成功,使用增强的Prompt
|
||||
if parseErr == nil && parseResult.HasCode {
|
||||
enhancedPrompt := t.contentParser.BuildPrompt(parseResult)
|
||||
if enhancedPrompt != "" {
|
||||
if prompt != "" {
|
||||
prompt = prompt + "\n\n" + enhancedPrompt
|
||||
} else {
|
||||
prompt = enhancedPrompt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 构建请求
|
||||
req := &provider.TranslateRequest{
|
||||
Text: text,
|
||||
Text: filteredText,
|
||||
FromLang: options.FromLang,
|
||||
ToLang: options.ToLang,
|
||||
Prompt: prompt,
|
||||
@@ -53,10 +74,17 @@ func (t *Translator) Translate(ctx context.Context, text string, options *Transl
|
||||
return nil, fmt.Errorf("翻译失败: %w", err)
|
||||
}
|
||||
|
||||
translatedText := resp.Text
|
||||
|
||||
// 如果包含代码且解析成功,重构结果
|
||||
if parseErr == nil && parseResult.HasCode {
|
||||
translatedText = t.contentParser.Reconstruct(parseResult, resp.Text)
|
||||
}
|
||||
|
||||
// 构建结果
|
||||
return &TranslateResult{
|
||||
Original: text,
|
||||
Translated: resp.Text,
|
||||
Translated: translatedText,
|
||||
FromLang: resp.FromLang,
|
||||
ToLang: resp.ToLang,
|
||||
Model: resp.Model,
|
||||
|
||||
Reference in New Issue
Block a user