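// Package content splits text that mixes prose, Markdown code (fenced blocks
// and inline spans) and source-code comments into typed segments, builds a
// translation prompt from those segments and reassembles translated output.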
package content
import (
	"fmt"
	"regexp"
	"strings"

	"github.com/go-enry/go-enry/v2"
)
// SegmentType identifies what kind of content a ContentSegment holds.
type SegmentType int

// The known segment kinds, numbered from zero in declaration order.
const (
	SegmentTypeText SegmentType = iota
	SegmentTypeCodeBlock
	SegmentTypeInlineCode
	SegmentTypeComment
)

// String returns the lower-case name of the segment type, or "unknown" for
// any value that is not one of the declared constants.
func (t SegmentType) String() string {
	names := [...]string{
		SegmentTypeText:       "text",
		SegmentTypeCodeBlock:  "code_block",
		SegmentTypeInlineCode: "inline_code",
		SegmentTypeComment:    "comment",
	}
	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}
type ContentSegment struct {
|
|||
|
|
Type SegmentType
|
|||
|
|
Content string
|
|||
|
|
Translated string
|
|||
|
|
Language string
|
|||
|
|
IsComment bool
|
|||
|
|
StartPos int
|
|||
|
|
EndPos int
|
|||
|
|
}
type ParseResult struct {
|
|||
|
|
Segments []ContentSegment
|
|||
|
|
SourceLang string
|
|||
|
|
HasCode bool
|
|||
|
|
}
// languageCommentPatterns describes the comment syntax of one language as it
// is written in the languagePatterns table.
type languageCommentPatterns struct {
	// LineComment is the marker that starts a comment running to the end of
	// the line; empty when the language has none. buildCommentRegex places
	// it into a regular expression without escaping.
	LineComment string
	// BlockComment lists block-comment delimiters. In the table every
	// opening delimiter is immediately followed by its closing delimiter;
	// the slice is empty when the language has no block comments.
	BlockComment []string
}
var languagePatterns = map[string]languageCommentPatterns{
|
|||
|
|
"javascript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"typescript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"java": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"kotlin": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"scala": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"cpp": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"c#": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"go": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"rust": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"php": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"swift": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"objective-c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"scss": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"css": {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"less": {LineComment: ``, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"html": {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
|
|||
|
|
"xml": {LineComment: ``, BlockComment: []string{`<!--`, `-->`}},
|
|||
|
|
"sql": {LineComment: `--`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"python": {LineComment: `#`, BlockComment: []string{`"""`, `"""`}},
|
|||
|
|
"ruby": {LineComment: `#`, BlockComment: []string{`=begin`, `=end`}},
|
|||
|
|
"shell": {LineComment: `#`, BlockComment: []string{}},
|
|||
|
|
"bash": {LineComment: `#`, BlockComment: []string{}},
|
|||
|
|
"powershell": {LineComment: `#`, BlockComment: []string{`<#`, `#>`}},
|
|||
|
|
"yaml": {LineComment: `#()`, BlockComment: []string{}},
|
|||
|
|
"json": {LineComment: ``, BlockComment: []string{}},
|
|||
|
|
"markdown": {LineComment: ``, BlockComment: []string{}},
|
|||
|
|
"vue": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`, `<!--`, `-->`}},
|
|||
|
|
"svelte": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"jsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
"tsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}},
|
|||
|
|
}
var defaultPatterns = languageCommentPatterns{
|
|||
|
|
LineComment: `//`,
|
|||
|
|
BlockComment: []string{`/*`, `*/`},
|
|||
|
|
}
// Parser segments input text and builds translation prompts from it.
// Construct one with NewParser.
type Parser struct {
	// skipKeywords are the words BuildPrompt tells the translator to leave
	// untranslated.
	skipKeywords []string
	// fallbackLang is returned by detectLanguage when no language can be
	// determined.
	fallbackLang string
}
func NewParser(skipKeywords []string) *Parser {
|
|||
|
|
if skipKeywords == nil {
|
|||
|
|
skipKeywords = []string{
|
|||
|
|
"TODO", "FIXME", "HACK", "XXX", "NOTE",
|
|||
|
|
"BUG", "WARN", "IMPORTANT", "TODO:",
|
|||
|
|
"FIXME:", "HACK:", "XXX:", "NOTE:",
|
|||
|
|
"BUG:", "WARN:", "IMPORTANT:",
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return &Parser{
|
|||
|
|
skipKeywords: skipKeywords,
|
|||
|
|
fallbackLang: "javascript",
|
|||
|
|
}
|
|||
|
|
}
func (p *Parser) Parse(text string) (*ParseResult, error) {
|
|||
|
|
result := &ParseResult{
|
|||
|
|
Segments: []ContentSegment{},
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
detectedLang := p.detectLanguage(text)
|
|||
|
|
result.SourceLang = detectedLang
|
|||
|
|
|
|||
|
|
segments := p.splitIntoSegments(text, result.SourceLang)
|
|||
|
|
|
|||
|
|
for _, seg := range segments {
|
|||
|
|
if seg.Type == SegmentTypeCodeBlock || seg.Type == SegmentTypeInlineCode {
|
|||
|
|
result.HasCode = true
|
|||
|
|
}
|
|||
|
|
result.Segments = append(result.Segments, seg)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return result, nil
|
|||
|
|
}
func (p *Parser) detectLanguage(text string) string {
|
|||
|
|
lines := strings.Split(text, "\n")
|
|||
|
|
var codeLines []string
|
|||
|
|
inCodeBlock := false
|
|||
|
|
|
|||
|
|
for _, line := range lines {
|
|||
|
|
trimmed := strings.TrimSpace(line)
|
|||
|
|
if strings.HasPrefix(trimmed, "```") {
|
|||
|
|
inCodeBlock = !inCodeBlock
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
if inCodeBlock && trimmed != "" {
|
|||
|
|
codeLines = append(codeLines, trimmed)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if len(codeLines) == 0 {
|
|||
|
|
for _, line := range lines {
|
|||
|
|
if strings.TrimSpace(line) != "" {
|
|||
|
|
codeLines = append(codeLines, line)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if len(codeLines) == 0 {
|
|||
|
|
return p.fallbackLang
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
sample := strings.Join(codeLines[:min(len(codeLines), 10)], "\n")
|
|||
|
|
lang := enry.GetLanguage("", []byte(sample))
|
|||
|
|
|
|||
|
|
if lang == "" {
|
|||
|
|
return p.fallbackLang
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return strings.ToLower(lang)
|
|||
|
|
}
func (p *Parser) splitIntoSegments(text string, lang string) []ContentSegment {
|
|||
|
|
segments := []ContentSegment{}
|
|||
|
|
|
|||
|
|
codeBlockPattern := regexp.MustCompile("(?s)```[\\s\\S]*?^```|`[^`]+`")
|
|||
|
|
matches := codeBlockPattern.FindAllStringIndex(text, -1)
|
|||
|
|
|
|||
|
|
if len(matches) == 0 {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text,
|
|||
|
|
StartPos: 0,
|
|||
|
|
EndPos: len(text),
|
|||
|
|
})
|
|||
|
|
return segments
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
lastEnd := 0
|
|||
|
|
for _, match := range matches {
|
|||
|
|
start, end := match[0], match[1]
|
|||
|
|
|
|||
|
|
if start > lastEnd {
|
|||
|
|
textPart := text[lastEnd:start]
|
|||
|
|
textSegments := p.parseTextContent(textPart, lang)
|
|||
|
|
segments = append(segments, textSegments...)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
content := text[start:end]
|
|||
|
|
isInline := len(content) > 0 && content[0] == '`' && (len(content) == 1 || content[len(content)-1] == '`')
|
|||
|
|
|
|||
|
|
if strings.HasPrefix(content, "```") {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeCodeBlock,
|
|||
|
|
Content: content,
|
|||
|
|
Language: p.detectCodeBlockLang(content),
|
|||
|
|
StartPos: start,
|
|||
|
|
EndPos: end,
|
|||
|
|
})
|
|||
|
|
} else if isInline {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeInlineCode,
|
|||
|
|
Content: content,
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: start,
|
|||
|
|
EndPos: end,
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
lastEnd = end
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if lastEnd < len(text) {
|
|||
|
|
textPart := text[lastEnd:]
|
|||
|
|
textSegments := p.parseTextContent(textPart, lang)
|
|||
|
|
segments = append(segments, textSegments...)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return segments
|
|||
|
|
}
func (p *Parser) parseTextContent(text string, lang string) []ContentSegment {
|
|||
|
|
segments := []ContentSegment{}
|
|||
|
|
langPatterns := getLanguagePatterns(lang)
|
|||
|
|
|
|||
|
|
if langPatterns.SingleLine == "" && len(langPatterns.MultiLine) == 0 {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text,
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: 0,
|
|||
|
|
EndPos: len(text),
|
|||
|
|
})
|
|||
|
|
return segments
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
commentPatterns := p.buildCommentRegex(langPatterns)
|
|||
|
|
if commentPatterns == nil {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text,
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: 0,
|
|||
|
|
EndPos: len(text),
|
|||
|
|
})
|
|||
|
|
return segments
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
matches := commentPatterns.FindAllStringIndex(text, -1)
|
|||
|
|
if len(matches) == 0 {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text,
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: 0,
|
|||
|
|
EndPos: len(text),
|
|||
|
|
})
|
|||
|
|
return segments
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
lastEnd := 0
|
|||
|
|
for _, match := range matches {
|
|||
|
|
start, end := match[0], match[1]
|
|||
|
|
|
|||
|
|
if start > lastEnd {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text[lastEnd:start],
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: lastEnd,
|
|||
|
|
EndPos: start,
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeComment,
|
|||
|
|
Content: text[start:end],
|
|||
|
|
IsComment: true,
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: start,
|
|||
|
|
EndPos: end,
|
|||
|
|
})
|
|||
|
|
|
|||
|
|
lastEnd = end
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if lastEnd < len(text) {
|
|||
|
|
segments = append(segments, ContentSegment{
|
|||
|
|
Type: SegmentTypeText,
|
|||
|
|
Content: text[lastEnd:],
|
|||
|
|
Language: lang,
|
|||
|
|
StartPos: lastEnd,
|
|||
|
|
EndPos: len(text),
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return segments
|
|||
|
|
}
// languageCommentRegex is the comment syntax of one language in the form
// buildCommentRegex consumes.
type languageCommentRegex struct {
	// SingleLine is the line-comment marker; it is used as a regular
	// expression fragment as is. Empty means "no line comments".
	SingleLine string
	// MultiLine holds the literal start and end delimiter of each kind of
	// block comment.
	MultiLine []struct {
		Start, End string
	}
}
func (p *Parser) buildCommentRegex(patterns languageCommentRegex) *regexp.Regexp {
|
|||
|
|
var parts []string
|
|||
|
|
|
|||
|
|
if patterns.SingleLine != "" {
|
|||
|
|
parts = append(parts, patterns.SingleLine+`.*$`)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for _, multi := range patterns.MultiLine {
|
|||
|
|
if multi.Start != "" && multi.End != "" {
|
|||
|
|
escapedStart := regexp.QuoteMeta(multi.Start)
|
|||
|
|
escapedEnd := regexp.QuoteMeta(multi.End)
|
|||
|
|
parts = append(parts, escapedStart+`[\s\S]*?`+escapedEnd)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if len(parts) == 0 {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
pattern := `(?m)` + strings.Join(parts, "|")
|
|||
|
|
return regexp.MustCompile(pattern)
|
|||
|
|
}
func getLanguagePatterns(lang string) languageCommentRegex {
|
|||
|
|
patterns, ok := languagePatterns[lang]
|
|||
|
|
if !ok {
|
|||
|
|
patterns = defaultPatterns
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
result := languageCommentRegex{
|
|||
|
|
SingleLine: patterns.LineComment,
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for _, bc := range patterns.BlockComment {
|
|||
|
|
if len(bc) >= 2 {
|
|||
|
|
result.MultiLine = append(result.MultiLine, struct {
|
|||
|
|
Start string
|
|||
|
|
End string
|
|||
|
|
}{Start: bc[:len(bc)/2], End: bc[len(bc)/2:]})
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return result
|
|||
|
|
}
func (p *Parser) detectCodeBlockLang(codeBlock string) string {
|
|||
|
|
lines := strings.Split(codeBlock, "\n")
|
|||
|
|
if len(lines) < 2 {
|
|||
|
|
return ""
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
firstLine := strings.TrimSpace(lines[0])
|
|||
|
|
firstLine = strings.TrimPrefix(firstLine, "```")
|
|||
|
|
firstLine = strings.TrimSpace(firstLine)
|
|||
|
|
|
|||
|
|
if firstLine != "" {
|
|||
|
|
lang := strings.ToLower(firstLine)
|
|||
|
|
if _, ok := languagePatterns[lang]; ok {
|
|||
|
|
return lang
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return ""
|
|||
|
|
}
func (p *Parser) BuildPrompt(result *ParseResult) string {
|
|||
|
|
var prompt strings.Builder
|
|||
|
|
|
|||
|
|
prompt.WriteString("你是一位专业的技术翻译。请翻译以下内容,遵守以下规则:\n\n")
|
|||
|
|
|
|||
|
|
prompt.WriteString("需要翻译的部分:\n")
|
|||
|
|
prompt.WriteString("- 普通文本:翻译成目标语言\n")
|
|||
|
|
prompt.WriteString("- 代码注释:只翻译注释中有意义的词汇,技术术语保留原语言\n\n")
|
|||
|
|
|
|||
|
|
prompt.WriteString("需要保持不变的部分:\n")
|
|||
|
|
prompt.WriteString("- 代码块(如 ```javascript ... ```)保持原样\n")
|
|||
|
|
prompt.WriteString("- 行内代码(如 `const count = 10`)保持原样\n")
|
|||
|
|
|
|||
|
|
if len(p.skipKeywords) > 0 {
|
|||
|
|
prompt.WriteString(fmt.Sprintf("- 以下关键词不翻译:%s\n", strings.Join(p.skipKeywords, "、")))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
prompt.WriteString("\n请将需要翻译的部分翻译成中文,其他部分保持不变。\n\n")
|
|||
|
|
prompt.WriteString("原文:\n---\n")
|
|||
|
|
|
|||
|
|
textToTranslate := p.extractTextForTranslation(result)
|
|||
|
|
prompt.WriteString(textToTranslate)
|
|||
|
|
|
|||
|
|
prompt.WriteString("\n---")
|
|||
|
|
|
|||
|
|
return prompt.String()
|
|||
|
|
}
func (p *Parser) extractTextForTranslation(result *ParseResult) string {
|
|||
|
|
var text strings.Builder
|
|||
|
|
|
|||
|
|
for _, seg := range result.Segments {
|
|||
|
|
switch seg.Type {
|
|||
|
|
case SegmentTypeText:
|
|||
|
|
text.WriteString(seg.Content)
|
|||
|
|
case SegmentTypeComment:
|
|||
|
|
text.WriteString(seg.Content)
|
|||
|
|
case SegmentTypeCodeBlock, SegmentTypeInlineCode:
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return text.String()
|
|||
|
|
}
func (p *Parser) Reconstruct(result *ParseResult, translatedText string) string {
|
|||
|
|
translatedLines := strings.Split(translatedText, "\n")
|
|||
|
|
var output strings.Builder
|
|||
|
|
|
|||
|
|
textIndex := 0
|
|||
|
|
|
|||
|
|
for _, seg := range result.Segments {
|
|||
|
|
switch seg.Type {
|
|||
|
|
case SegmentTypeText, SegmentTypeComment:
|
|||
|
|
if textIndex < len(translatedLines) {
|
|||
|
|
output.WriteString(translatedLines[textIndex])
|
|||
|
|
textIndex++
|
|||
|
|
}
|
|||
|
|
case SegmentTypeCodeBlock, SegmentTypeInlineCode:
|
|||
|
|
output.WriteString(seg.Content)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return output.String()
|
|||
|
|
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}