Convert formatting entities to Markdown

This commit is contained in:
bodqhrohro 2020-01-09 23:16:40 +02:00
parent b8fcac6ae2
commit 70383bee12
4 changed files with 411 additions and 8 deletions

View file

@ -4,7 +4,7 @@ all:
go build -o telegabber
test:
go test -v ./config ./ ./telegram ./xmpp/gateway ./persistence
go test -v ./config ./ ./telegram ./xmpp/gateway ./persistence ./telegram/formatter
lint:
$(GOPATH)/bin/golint ./...

View file

@ -0,0 +1,165 @@
package formatter
import (
"sort"
log "github.com/sirupsen/logrus"
"github.com/zelenin/go-tdlib/client"
)
// Insertion is a run of markup runes to be spliced into a text at a given
// position. Format compares Offset against a counter that advances in UTF-16
// code units, so Offset is expressed in those units, not in bytes or runes.
type Insertion struct {
// Offset is the position in front of which Runes are inserted.
Offset int32
// Runes is the markup text to insert.
Runes []rune
}
// InsertionStack is an ordered sequence of insertions used as a stack.
// Format keeps two of them: one that grows in text order and is later read
// from the start, and one holding pending closing markup that rebalance pops
// from the end.
type InsertionStack []*Insertion
// Markdown delimiters shared by EntityToMarkdown.
var (
	// boldRunes wraps bold text on both sides.
	boldRunes = []rune("**")
	// italicRunes wraps italic text on both sides.
	italicRunes = []rune("_")
	// codeRunes is a fenced code block delimiter on its own line.
	codeRunes = []rune("\n```\n")
	// urlRuneL opens the label part of a Markdown link.
	urlRuneL = []rune("[")
)
// rebalance moves insertions from the tail of s2 (a stack growing from the
// end) onto s (a stack growing from the start) for as long as the tail
// element's offset does not exceed the given offset. Call it before pushing
// anything onto s at that offset. Both updated stacks are returned.
func (s InsertionStack) rebalance(s2 InsertionStack, offset int32) (InsertionStack, InsertionStack) {
	for {
		last := len(s2) - 1
		if last < 0 || s2[last].Offset > offset {
			break
		}
		s = append(s, s2[last])
		s2 = s2[:last]
	}
	return s, s2
}
// NewIterator returns a closure that yields the stack elements one by one,
// front to back. Once the elements are exhausted, every further call
// returns nil.
func (s InsertionStack) NewIterator() func() *Insertion {
	next := 0
	return func() *Insertion {
		if next >= len(s) {
			return nil
		}
		current := s[next]
		next++
		return current
	}
}
// SortEntities returns a sorted copy of entities in traversal-ready order:
// ascending by Offset, and for equal offsets descending by Length, so that an
// enclosing entity always precedes the entities nested inside it. The input
// slice is left untouched. Entities with identical offset and length keep
// their relative input order.
func SortEntities(entities []*client.TextEntity) []*client.TextEntity {
	sortedEntities := make([]*client.TextEntity, len(entities))
	copy(sortedEntities, entities)

	// The comparator must index the slice being sorted: sort swaps elements
	// of sortedEntities, so indices i and j refer to its current state, not
	// to the original input.
	sort.SliceStable(sortedEntities, func(i, j int) bool {
		a, b := sortedEntities[i], sortedEntities[j]
		if a.Offset != b.Offset {
			return a.Offset < b.Offset
		}
		return a.Length > b.Length
	})

	return sortedEntities
}
// markupBraces builds the pair of insertions that surround an entity:
// lbrace placed at the entity's offset and rbrace placed right after its end
// (offset + length).
func markupBraces(entity *client.TextEntity, lbrace, rbrace []rune) (*Insertion, *Insertion) {
	start := entity.Offset
	end := start + entity.Length

	opening := &Insertion{Offset: start, Runes: lbrace}
	closing := &Insertion{Offset: end, Runes: rbrace}
	return opening, closing
}
// EntityToMarkdown generates the wrapping Markdown tags for one entity.
//
// It returns the opening and the closing insertion, or (nil, nil) when the
// entity is nil, has no type, or has a type with no Markdown mapping here.
// Bold becomes **…**, italic _…_, text URLs [label](url); inline code, pre
// and pre-with-language blocks are all rendered as fenced code blocks.
func EntityToMarkdown(entity *client.TextEntity) (*Insertion, *Insertion) {
	if entity == nil || entity.Type == nil {
		return nil, nil
	}

	switch entity.Type.TextEntityTypeType() {
	case client.TypeTextEntityTypeBold:
		return markupBraces(entity, boldRunes, boldRunes)
	case client.TypeTextEntityTypeItalic:
		return markupBraces(entity, italicRunes, italicRunes)
	case client.TypeTextEntityTypeCode, client.TypeTextEntityTypePre:
		return markupBraces(entity, codeRunes, codeRunes)
	case client.TypeTextEntityTypePreCode:
		preCode, ok := entity.Type.(*client.TextEntityTypePreCode)
		if !ok || preCode == nil {
			// The type tag and the concrete type disagree: fall back to a
			// fence without a language instead of dereferencing nil.
			return markupBraces(entity, codeRunes, codeRunes)
		}
		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), codeRunes)
	case client.TypeTextEntityTypeTextUrl:
		textURL, ok := entity.Type.(*client.TextEntityTypeTextUrl)
		if !ok || textURL == nil {
			// Without the URL there is nothing meaningful to emit.
			return nil, nil
		}
		return markupBraces(entity, urlRuneL, []rune("]("+textURL.Url+")"))
	}

	return nil, nil
}
// Format traverses an already sorted list of entities (see SortEntities) and
// wraps the corresponding parts of sourceText in markup.
//
// entityToMarkup maps an entity to its opening and closing insertion; either
// may be nil, in which case it is skipped. Entities that are nil or have a
// non-positive length are ignored. Insertion offsets are matched against a
// position counter that advances in UTF-16 code units, so code points above
// U+FFFF count as two. When entities is empty, sourceText is returned as is.
func Format(
	sourceText string,
	entities []*client.TextEntity,
	entityToMarkup func(*client.TextEntity) (*Insertion, *Insertion),
) string {
	if len(entities) == 0 {
		return sourceText
	}

	// Each entity yields at most one opening and one closing insertion, and
	// every closing one is eventually moved to startStack by rebalance, so
	// the capacities depend on the entity count, not on the text size.
	startStack := make(InsertionStack, 0, 2*len(entities))
	endStack := make(InsertionStack, 0, len(entities))

	// convert entities to a stack of brackets
	var maxEndOffset int32
	for _, entity := range entities {
		log.Debugf("%#v", entity)
		if entity == nil || entity.Length <= 0 {
			continue
		}

		endOffset := entity.Offset + entity.Length
		if endOffset > maxEndOffset {
			maxEndOffset = endOffset
		}

		// closing brackets located at or before this entity's start must
		// precede its opening bracket
		startStack, endStack = startStack.rebalance(endStack, entity.Offset)

		startInsertion, endInsertion := entityToMarkup(entity)
		if startInsertion != nil {
			startStack = append(startStack, startInsertion)
		}
		if endInsertion != nil {
			endStack = append(endStack, endInsertion)
		}
	}
	// flush the closing brackets that still remain in endStack
	startStack, endStack = startStack.rebalance(endStack, maxEndOffset)

	// merge brackets into text
	markupRunes := make([]rune, 0, len(sourceText))
	nextInsertion := startStack.NewIterator()
	insertion := nextInsertion()
	var runeI int32

	for _, cp := range sourceText {
		// emit everything scheduled at or before the current position
		for insertion != nil && insertion.Offset <= runeI {
			markupRunes = append(markupRunes, insertion.Runes...)
			insertion = nextInsertion()
		}

		markupRunes = append(markupRunes, cp)
		// skip two UTF-16 code units (not points actually!) if needed
		if cp > 0x0000ffff {
			runeI += 2
		} else {
			runeI++
		}
	}
	// insertions positioned at or past the end of the text
	for insertion != nil {
		markupRunes = append(markupRunes, insertion.Runes...)
		insertion = nextInsertion()
	}

	return string(markupRunes)
}

View file

@ -0,0 +1,208 @@
package formatter
import (
"testing"
"github.com/zelenin/go-tdlib/client"
)
// TestNoFormatting checks that a text without entities comes back unchanged.
func TestNoFormatting(t *testing.T) {
	const text = "abc\ndef"

	got := Format(text, []*client.TextEntity{}, EntityToMarkdown)
	if got == text {
		return
	}
	t.Errorf("No formatting expected, but: %v", got)
}
func TestFormattingSimple(t *testing.T) {
markup := Format("👙🐧🐖", []*client.TextEntity{
&client.TextEntity{
Offset: 2,
Length: 4,
Type: &client.TextEntityTypeBold{},
},
}, EntityToMarkdown)
if markup != "👙**🐧🐖**" {
t.Errorf("Wrong simple formatting: %v", markup)
}
}
func TestFormattingAdjacent(t *testing.T) {
markup := Format("a👙🐧🐖", []*client.TextEntity{
&client.TextEntity{
Offset: 3,
Length: 2,
Type: &client.TextEntityTypeItalic{},
},
&client.TextEntity{
Offset: 5,
Length: 2,
Type: &client.TextEntityTypeTextUrl{
Url: "https://narayana.im/",
},
},
}, EntityToMarkdown)
if markup != "a👙_🐧_[🐖](https://narayana.im/)" {
t.Errorf("Wrong adjacent formatting: %v", markup)
}
}
func TestFormattingAdjacentAndNested(t *testing.T) {
markup := Format("👙🐧🐖", []*client.TextEntity{
&client.TextEntity{
Offset: 0,
Length: 4,
Type: &client.TextEntityTypePre{},
},
&client.TextEntity{
Offset: 0,
Length: 2,
Type: &client.TextEntityTypeBold{},
},
&client.TextEntity{
Offset: 4,
Length: 2,
Type: &client.TextEntityTypeItalic{},
},
}, EntityToMarkdown)
if markup != "\n```\n**👙**🐧\n```\n_🐖_" {
t.Errorf("Wrong adjacent&nested formatting: %v", markup)
}
}
// TestRebalanceTwoZero rebalances against an empty end stack: nothing moves.
func TestRebalanceTwoZero(t *testing.T) {
	front := InsertionStack{{Offset: 7}, {Offset: 8}}
	back := InsertionStack{}

	front, back = front.rebalance(back, 7)

	ok := len(front) == 2 && len(back) == 0 &&
		front[0].Offset == 7 && front[1].Offset == 8
	if !ok {
		t.Errorf("Wrong rebalance 20: %#v %#v", front, back)
	}
}
// TestRebalanceNeeded expects exactly one element (offset 9) to be moved
// from the end stack when rebalancing at offset 9.
func TestRebalanceNeeded(t *testing.T) {
	front := InsertionStack{{Offset: 7}, {Offset: 8}}
	back := InsertionStack{{Offset: 10}, {Offset: 9}}

	front, back = front.rebalance(back, 9)

	ok := len(front) == 3 && len(back) == 1 &&
		front[0].Offset == 7 && front[1].Offset == 8 && front[2].Offset == 9 &&
		back[0].Offset == 10
	if !ok {
		t.Errorf("Wrong rebalance when needed: %#v %#v", front, back)
	}
}
// TestRebalanceNotNeeded expects both stacks to stay as they are when every
// pending element lies beyond the requested offset.
func TestRebalanceNotNeeded(t *testing.T) {
	front := InsertionStack{{Offset: 7}, {Offset: 8}}
	back := InsertionStack{{Offset: 10}, {Offset: 9}}

	front, back = front.rebalance(back, 8)

	ok := len(front) == 2 && len(back) == 2 &&
		front[0].Offset == 7 && front[1].Offset == 8 &&
		back[0].Offset == 10 && back[1].Offset == 9
	if !ok {
		t.Errorf("Wrong rebalance when not needed: %#v %#v", front, back)
	}
}
// TestRebalanceLate expects the whole end stack to be drained, in ascending
// offset order, when rebalancing at the largest pending offset.
func TestRebalanceLate(t *testing.T) {
	front := InsertionStack{{Offset: 7}, {Offset: 8}}
	back := InsertionStack{{Offset: 10}, {Offset: 9}}

	front, back = front.rebalance(back, 10)

	ok := len(front) == 4 && len(back) == 0 &&
		front[0].Offset == 7 && front[1].Offset == 8 &&
		front[2].Offset == 9 && front[3].Offset == 10
	if !ok {
		t.Errorf("Wrong rebalance when late: %#v %#v", front, back)
	}
}
// TestIteratorEmpty checks that iterating an empty stack yields nil at once.
func TestIteratorEmpty(t *testing.T) {
	next := InsertionStack{}.NewIterator()

	if got := next(); got != nil {
		t.Errorf("Empty iterator should return nil but returned %#v", got)
	}
}
// TestIterator walks a two-element stack and then keeps calling the
// iterator to make sure it stays exhausted.
func TestIterator(t *testing.T) {
	next := InsertionStack{{Offset: 7}, {Offset: 8}}.NewIterator()

	if first := next(); first == nil || first.Offset != 7 {
		t.Errorf("Wrong insertion instead of 7: %#v", first)
	}
	if second := next(); second == nil || second.Offset != 8 {
		t.Errorf("Wrong insertion instead of 8: %#v", second)
	}
	if third := next(); third != nil {
		t.Errorf("nil should be returned after end, %#v instead", third)
	}
	if fourth := next(); fourth != nil {
		t.Errorf("Further attempts should return nil too, %#v instead", fourth)
	}
}
// TestSortEntities checks the ordering contract: ascending by offset, and
// for equal offsets the longer entity first.
func TestSortEntities(t *testing.T) {
	input := []*client.TextEntity{
		{Offset: 3, Length: 2},
		{Offset: 5, Length: 2},
		{Offset: 7, Length: 2},
		{Offset: 6, Length: 1},
		{Offset: 5, Length: 1},
	}
	// expected {offset, length} pairs, in order
	want := [][2]int32{{3, 2}, {5, 2}, {5, 1}, {6, 1}, {7, 2}}

	sorted := SortEntities(input)

	ok := len(sorted) == len(want)
	for i := 0; ok && i < len(want); i++ {
		ok = sorted[i].Offset == want[i][0] && sorted[i].Length == want[i][1]
	}
	if !ok {
		t.Errorf("Wrong sorting order: %#v", sorted)
	}
}
func TestSortEmpty(t *testing.T) {
entities := []*client.TextEntity{}
entities = SortEntities(entities)
if len(entities) != 0 {
t.Errorf("Empty entities set sorting error: %#v", entities)
}
}

View file

@ -15,6 +15,7 @@ import (
"time"
"dev.narayana.im/narayana/telegabber/telegram/cache"
"dev.narayana.im/narayana/telegabber/telegram/formatter"
"dev.narayana.im/narayana/telegabber/xmpp/gateway"
log "github.com/sirupsen/logrus"
@ -281,6 +282,7 @@ func (c *Client) formatContent(file *client.File, filename string) string {
}
func (c *Client) messageToText(message *client.Message) string {
markupFunction := formatter.EntityToMarkdown
switch message.Content.MessageContentType() {
case client.TypeMessageSticker:
sticker, _ := message.Content.(*client.MessageSticker)
@ -318,27 +320,55 @@ func (c *Client) messageToText(message *client.Message) string {
)
case client.TypeMessagePhoto:
photo, _ := message.Content.(*client.MessagePhoto)
return photo.Caption.Text
return formatter.Format(
photo.Caption.Text,
formatter.SortEntities(photo.Caption.Entities),
markupFunction,
)
case client.TypeMessageAudio:
audio, _ := message.Content.(*client.MessageAudio)
return audio.Caption.Text
return formatter.Format(
audio.Caption.Text,
formatter.SortEntities(audio.Caption.Entities),
markupFunction,
)
case client.TypeMessageVideo:
video, _ := message.Content.(*client.MessageVideo)
return video.Caption.Text
return formatter.Format(
video.Caption.Text,
formatter.SortEntities(video.Caption.Entities),
markupFunction,
)
case client.TypeMessageDocument:
document, _ := message.Content.(*client.MessageDocument)
return document.Caption.Text
return formatter.Format(
document.Caption.Text,
formatter.SortEntities(document.Caption.Entities),
markupFunction,
)
case client.TypeMessageText:
text, _ := message.Content.(*client.MessageText)
return text.Text.Text
return formatter.Format(
text.Text.Text,
formatter.SortEntities(text.Text.Entities),
markupFunction,
)
case client.TypeMessageVoiceNote:
voice, _ := message.Content.(*client.MessageVoiceNote)
return voice.Caption.Text
return formatter.Format(
voice.Caption.Text,
formatter.SortEntities(voice.Caption.Entities),
markupFunction,
)
case client.TypeMessageVideoNote:
return ""
case client.TypeMessageAnimation:
animation, _ := message.Content.(*client.MessageAnimation)
return animation.Caption.Text
return formatter.Format(
animation.Caption.Text,
formatter.SortEntities(animation.Caption.Entities),
markupFunction,
)
}
return fmt.Sprintf("unknown message (%s)", message.Content.MessageContentType())