package content import ( "fmt" "regexp" "strings" "github.com/go-enry/go-enry/v2" ) type SegmentType int const ( SegmentTypeText SegmentType = iota SegmentTypeCodeBlock SegmentTypeInlineCode SegmentTypeComment ) func (t SegmentType) String() string { switch t { case SegmentTypeText: return "text" case SegmentTypeCodeBlock: return "code_block" case SegmentTypeInlineCode: return "inline_code" case SegmentTypeComment: return "comment" default: return "unknown" } } type ContentSegment struct { Type SegmentType Content string Translated string Language string IsComment bool StartPos int EndPos int } type ParseResult struct { Segments []ContentSegment SourceLang string HasCode bool } type languageCommentPatterns struct { LineComment string BlockComment []string } var languagePatterns = map[string]languageCommentPatterns{ "javascript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "typescript": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "java": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "kotlin": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "scala": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "cpp": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "c#": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "go": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "rust": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "php": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "swift": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "objective-c": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "scss": {LineComment: `//`, BlockComment: []string{`/*`, `*/`}}, "css": {LineComment: ``, BlockComment: []string{`/*`, `*/`}}, "less": {LineComment: ``, BlockComment: []string{`/*`, `*/`}}, "html": {LineComment: ``, BlockComment: []string{``}}, "xml": {LineComment: ``, BlockComment: []string{``}}, "sql": {LineComment: `--`, BlockComment: []string{`/*`, `*/`}}, "python": {LineComment: `#`, BlockComment: []string{`"""`, `"""`}}, "ruby": {LineComment: `#`, BlockComment: []string{`=begin`, `=end`}}, "shell": {LineComment: `#`, BlockComment: []string{}}, "bash": {LineComment: `#`, BlockComment: []string{}}, "powershell": {LineComment: `#`, BlockComment: []string{`<#`, `#>`}}, "yaml": {LineComment: `#()`, BlockComment: []string{}}, "json": {LineComment: ``, BlockComment: []string{}}, "markdown": {LineComment: ``, BlockComment: []string{}}, "vue": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`, ``}}, "svelte": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}}, "jsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}}, "tsx": {LineComment: `//()`, BlockComment: []string{`/*`, `*/`}}, } var defaultPatterns = languageCommentPatterns{ LineComment: `//`, BlockComment: []string{`/*`, `*/`}, } type Parser struct { skipKeywords []string fallbackLang string } func NewParser(skipKeywords []string) *Parser { if skipKeywords == nil { skipKeywords = []string{ "TODO", "FIXME", "HACK", "XXX", "NOTE", "BUG", "WARN", "IMPORTANT", "TODO:", "FIXME:", "HACK:", "XXX:", "NOTE:", "BUG:", "WARN:", "IMPORTANT:", } } return &Parser{ skipKeywords: skipKeywords, fallbackLang: "javascript", } } func (p *Parser) Parse(text string) (*ParseResult, error) { result := &ParseResult{ Segments: []ContentSegment{}, } detectedLang := p.detectLanguage(text) result.SourceLang = detectedLang segments := p.splitIntoSegments(text, result.SourceLang) for _, seg := range segments { if seg.Type == SegmentTypeCodeBlock || seg.Type == SegmentTypeInlineCode { result.HasCode = true } result.Segments = append(result.Segments, seg) } return result, nil } func (p *Parser) detectLanguage(text string) string { lines := strings.Split(text, "\n") var codeLines []string inCodeBlock := false for _, line := range lines { trimmed := strings.TrimSpace(line) if strings.HasPrefix(trimmed, "```") { inCodeBlock = !inCodeBlock continue } if inCodeBlock && trimmed != "" { codeLines = append(codeLines, trimmed) } } if len(codeLines) == 0 { for _, line := range lines { if strings.TrimSpace(line) != "" { codeLines = append(codeLines, line) } } } if len(codeLines) == 0 { return p.fallbackLang } sample := strings.Join(codeLines[:min(len(codeLines), 10)], "\n") lang := enry.GetLanguage("", []byte(sample)) if lang == "" { return p.fallbackLang } return strings.ToLower(lang) } func (p *Parser) splitIntoSegments(text string, lang string) []ContentSegment { segments := []ContentSegment{} codeBlockPattern := regexp.MustCompile("(?s)```[\\s\\S]*?^```|`[^`]+`") matches := codeBlockPattern.FindAllStringIndex(text, -1) if len(matches) == 0 { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text, StartPos: 0, EndPos: len(text), }) return segments } lastEnd := 0 for _, match := range matches { start, end := match[0], match[1] if start > lastEnd { textPart := text[lastEnd:start] textSegments := p.parseTextContent(textPart, lang) segments = append(segments, textSegments...) } content := text[start:end] isInline := len(content) > 0 && content[0] == '`' && (len(content) == 1 || content[len(content)-1] == '`') if strings.HasPrefix(content, "```") { segments = append(segments, ContentSegment{ Type: SegmentTypeCodeBlock, Content: content, Language: p.detectCodeBlockLang(content), StartPos: start, EndPos: end, }) } else if isInline { segments = append(segments, ContentSegment{ Type: SegmentTypeInlineCode, Content: content, Language: lang, StartPos: start, EndPos: end, }) } lastEnd = end } if lastEnd < len(text) { textPart := text[lastEnd:] textSegments := p.parseTextContent(textPart, lang) segments = append(segments, textSegments...) } return segments } func (p *Parser) parseTextContent(text string, lang string) []ContentSegment { segments := []ContentSegment{} langPatterns := getLanguagePatterns(lang) if langPatterns.SingleLine == "" && len(langPatterns.MultiLine) == 0 { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text, Language: lang, StartPos: 0, EndPos: len(text), }) return segments } commentPatterns := p.buildCommentRegex(langPatterns) if commentPatterns == nil { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text, Language: lang, StartPos: 0, EndPos: len(text), }) return segments } matches := commentPatterns.FindAllStringIndex(text, -1) if len(matches) == 0 { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text, Language: lang, StartPos: 0, EndPos: len(text), }) return segments } lastEnd := 0 for _, match := range matches { start, end := match[0], match[1] if start > lastEnd { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text[lastEnd:start], Language: lang, StartPos: lastEnd, EndPos: start, }) } segments = append(segments, ContentSegment{ Type: SegmentTypeComment, Content: text[start:end], IsComment: true, Language: lang, StartPos: start, EndPos: end, }) lastEnd = end } if lastEnd < len(text) { segments = append(segments, ContentSegment{ Type: SegmentTypeText, Content: text[lastEnd:], Language: lang, StartPos: lastEnd, EndPos: len(text), }) } return segments } type languageCommentRegex struct { SingleLine string MultiLine []struct { Start string End string } } func (p *Parser) buildCommentRegex(patterns languageCommentRegex) *regexp.Regexp { var parts []string if patterns.SingleLine != "" { parts = append(parts, patterns.SingleLine+`.*$`) } for _, multi := range patterns.MultiLine { if multi.Start != "" && multi.End != "" { escapedStart := regexp.QuoteMeta(multi.Start) escapedEnd := regexp.QuoteMeta(multi.End) parts = append(parts, escapedStart+`[\s\S]*?`+escapedEnd) } } if len(parts) == 0 { return nil } pattern := `(?m)` + strings.Join(parts, "|") return regexp.MustCompile(pattern) } func getLanguagePatterns(lang string) languageCommentRegex { patterns, ok := languagePatterns[lang] if !ok { patterns = defaultPatterns } result := languageCommentRegex{ SingleLine: patterns.LineComment, } for _, bc := range patterns.BlockComment { if len(bc) >= 2 { result.MultiLine = append(result.MultiLine, struct { Start string End string }{Start: bc[:len(bc)/2], End: bc[len(bc)/2:]}) } } return result } func (p *Parser) detectCodeBlockLang(codeBlock string) string { lines := strings.Split(codeBlock, "\n") if len(lines) < 2 { return "" } firstLine := strings.TrimSpace(lines[0]) firstLine = strings.TrimPrefix(firstLine, "```") firstLine = strings.TrimSpace(firstLine) if firstLine != "" { lang := strings.ToLower(firstLine) if _, ok := languagePatterns[lang]; ok { return lang } } return "" } func (p *Parser) BuildPrompt(result *ParseResult) string { var prompt strings.Builder prompt.WriteString("你是一位专业的技术翻译。请翻译以下内容,遵守以下规则:\n\n") prompt.WriteString("需要翻译的部分:\n") prompt.WriteString("- 普通文本:翻译成目标语言\n") prompt.WriteString("- 代码注释:只翻译注释中有意义的词汇,技术术语保留原语言\n\n") prompt.WriteString("需要保持不变的部分:\n") prompt.WriteString("- 代码块(如 ```javascript ... ```)保持原样\n") prompt.WriteString("- 行内代码(如 `const count = 10`)保持原样\n") if len(p.skipKeywords) > 0 { prompt.WriteString(fmt.Sprintf("- 以下关键词不翻译:%s\n", strings.Join(p.skipKeywords, "、"))) } prompt.WriteString("\n请将需要翻译的部分翻译成中文,其他部分保持不变。\n\n") prompt.WriteString("原文:\n---\n") textToTranslate := p.extractTextForTranslation(result) prompt.WriteString(textToTranslate) prompt.WriteString("\n---") return prompt.String() } func (p *Parser) extractTextForTranslation(result *ParseResult) string { var text strings.Builder for _, seg := range result.Segments { switch seg.Type { case SegmentTypeText: text.WriteString(seg.Content) case SegmentTypeComment: text.WriteString(seg.Content) case SegmentTypeCodeBlock, SegmentTypeInlineCode: } } return text.String() } func (p *Parser) Reconstruct(result *ParseResult, translatedText string) string { translatedLines := strings.Split(translatedText, "\n") var output strings.Builder textIndex := 0 for _, seg := range result.Segments { switch seg.Type { case SegmentTypeText, SegmentTypeComment: if textIndex < len(translatedLines) { output.WriteString(translatedLines[textIndex]) textIndex++ } case SegmentTypeCodeBlock, SegmentTypeInlineCode: output.WriteString(seg.Content) } } return output.String() } func min(a, b int) int { if a < b { return a } return b }