2
0
Files
bot/packages/lib/markdown/serializer/serialize.ts
Baptiste Arnaud ff9c4726cc 🚸 Rewrite the markdown deserializer to improve br… (#1198)
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **Refactor**
- Updated markdown handling and serialization libraries for improved
performance and accuracy in text formatting.
- **New Features**
- Enhanced rich text and markdown conversion capabilities, providing
users with more reliable and seamless text formatting options.
- **Documentation**
- Added detailed documentation for markdown to rich text conversion and
vice versa, ensuring easier understanding and implementation for
developers.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2024-01-31 08:03:13 +01:00

287 lines
8.7 KiB
TypeScript

import { BlockType, defaultNodeTypes, LeafType, NodeTypes } from './ast-types'
import escapeHtml from 'escape-html'
// Options accepted by `serialize`; they are also forwarded (with adjustments)
// to the recursive calls made for child nodes.
interface Options {
// Node type names to recognise. `serialize` merges these over `defaultNodeTypes`.
nodeTypes: NodeTypes
// Output dialect. With 'whatsapp', `serialize` uses different bold / code
// markers and returns plain text without HTML escaping.
flavour?: 'common' | 'whatsapp'
// Nesting depth of the list being serialized. `serialize` defaults it to -1,
// increments it for each nested list node and uses it to indent list items.
listDepth?: number
// When true, an empty (or "\n") text node under a paragraph is NOT turned
// into a break tag.
ignoreParagraphNewline?: boolean
}
/**
 * Type guard telling leaf nodes apart from block nodes.
 *
 * A node counts as a leaf as soon as it carries a `text` property holding a
 * string (the empty string included).
 */
function isLeafNode(node: BlockType | LeafType): node is LeafType {
  const { text } = node as LeafType
  return typeof text === 'string'
}
// Node types that `serialize` still emits when they have no text content,
// instead of returning early with `undefined`.
const VOID_ELEMENTS: Array<keyof NodeTypes> = ['thematic_break', 'image']
// Markup `serialize` substitutes for an empty text node whose parent is a
// paragraph; such content is never wrapped in bold/italic/etc. markers.
const BREAK_TAG = '<br>'
/**
 * Serializes a rich-text node and, recursively, all of its children into a
 * markdown string.
 *
 * @param chunk - Block node (has `children`) or leaf node (has a string
 * `text`) to serialize.
 * @param opts - Serialization options. `nodeTypes` is merged over
 * `defaultNodeTypes`; `flavour: 'whatsapp'` switches some markers and disables
 * HTML escaping of plain text.
 * @returns The markdown for `chunk`, or `undefined` when the node produced no
 * content and is not a void element (thematic break, image).
 */
export default function serialize(
  chunk: BlockType | LeafType,
  opts: Options = {
    nodeTypes: defaultNodeTypes,
  }
): string | undefined {
  const {
    nodeTypes: userNodeTypes = defaultNodeTypes,
    ignoreParagraphNewline = false,
    listDepth = -1,
  } = opts
  const isWhatsApp = opts.flavour === 'whatsapp'

  const text = (chunk as LeafType).text || ''
  let type = (chunk as BlockType).type || ''

  // User-supplied names win over the defaults; headings are merged one level
  // deeper so a partial `heading` override keeps the remaining levels.
  const nodeTypes: NodeTypes = {
    ...defaultNodeTypes,
    ...userNodeTypes,
    heading: {
      ...defaultNodeTypes.heading,
      ...userNodeTypes.heading,
    },
  }

  // An inline variable is transparent: serialize its children directly and
  // tag them so they skip the block-level switch further down.
  if ('type' in chunk && chunk.type === nodeTypes['inline-variable'])
    return chunk.children
      .map((child) =>
        serialize(
          {
            ...child,
            parentType: nodeTypes['inline-variable'],
          },
          opts
        )
      )
      .join('')

  const LIST_TYPES = [nodeTypes.ul_list, nodeTypes.ol_list]

  let children = text

  if (!isLeafNode(chunk)) {
    // Both values depend only on `chunk`, so compute them once instead of
    // once per child.
    const selfIsList = (LIST_TYPES as string[]).includes(chunk.type || '')

    // Links can have the following shape
    // In which case we don't want to surround
    // with break tags
    // {
    //  type: 'paragraph',
    //  children: [
    //    { text: '' },
    //    { type: 'link', children: [{ text: foo.com }]}
    //    { text: '' }
    //  ]
    // }
    const childrenHasLink =
      Array.isArray(chunk.children) &&
      chunk.children.some((f) => !isLeafNode(f) && f.type === nodeTypes.link)

    children = chunk.children
      .map((c: BlockType | LeafType, index) => {
        const isList = !isLeafNode(c)
          ? (LIST_TYPES as string[]).includes(c.type || '')
          : false

        return serialize(
          {
            ...c,
            // A list item is a pass-through wrapper: its children inherit the
            // list's type and the item's index rather than the item's own.
            parentType: type === nodeTypes.listItem ? chunk.parentType : type,
            listIndex:
              type === nodeTypes.listItem ? chunk.listIndex : index + 1,
          },
          {
            flavour: opts.flavour,
            nodeTypes,
            // This relates to the block below where we check
            // ignoreParagraphNewline and set type to paragraph.
            // We want to strip out empty paragraphs sometimes, but other
            // times we don't. If we're the descendant of a list, we know we
            // don't want a bunch of whitespace. If we're parallel to a link
            // we also don't want to respect neighboring paragraphs.
            ignoreParagraphNewline:
              (ignoreParagraphNewline ||
                isList ||
                selfIsList ||
                childrenHasLink) &&
              // if we have c.break, never ignore empty paragraph new line
              !(c as BlockType).break,
            // track depth of nested lists so we can add proper spacing
            listDepth: (LIST_TYPES as string[]).includes(
              (c as BlockType).type || ''
            )
              ? listDepth + 1
              : listDepth,
          }
        )
      })
      .join('')
  }

  // Fragile: see the long comment where we iterate over children. An empty
  // text node directly under a paragraph becomes an explicit break.
  if (
    !ignoreParagraphNewline &&
    (text === '' || text === '\n') &&
    chunk.parentType === nodeTypes.paragraph
  ) {
    type = nodeTypes.paragraph
    children = BREAK_TAG
  }

  // Nothing to emit, unless the node is a void element that renders without
  // any content of its own.
  if (children === '' && !VOID_ELEMENTS.some((k) => nodeTypes[k] === type))
    return

  // Never allow decorating break tags with rich text formatting,
  // this can malform generated markdown.
  // Also ensure we're only ever applying text formatting to leaf node
  // level chunks, otherwise we can end up in a situation where
  // we try applying formatting to a node like this:
  // "Text foo bar **baz**" resulting in "**Text foo bar **baz****"
  // which is invalid markup and can mess everything up.
  if (children !== BREAK_TAG && isLeafNode(chunk)) {
    if (chunk.strikeThrough && chunk.bold && chunk.italic) {
      children = retainWhitespaceAndFormat(
        children,
        isWhatsApp ? '*_~' : '~~***'
      )
    } else if (chunk.bold && chunk.italic) {
      children = retainWhitespaceAndFormat(children, isWhatsApp ? '*_' : '***')
    } else {
      // Marks are applied one after another, so later ones wrap earlier ones.
      if (chunk.bold) {
        children = retainWhitespaceAndFormat(children, isWhatsApp ? '*' : '**')
      }
      if (chunk.italic) {
        children = retainWhitespaceAndFormat(children, '_')
      }
      if (chunk.strikeThrough) {
        children = retainWhitespaceAndFormat(children, '~~')
      }
      if (chunk.code) {
        children = retainWhitespaceAndFormat(children, isWhatsApp ? '```' : '`')
      }
    }
  }

  // Content of an inline variable is returned as-is (HTML-escaped outside of
  // the WhatsApp flavour) and never goes through the block-level switch.
  if (chunk.parentType === nodeTypes['inline-variable']) {
    if (isWhatsApp) {
      return children
    }
    return escapeHtml(children)
  }

  switch (type) {
    case nodeTypes.heading[1]:
      return `# ${children}\n`
    case nodeTypes.heading[2]:
      return `## ${children}\n`
    case nodeTypes.heading[3]:
      return `### ${children}\n`
    case nodeTypes.heading[4]:
      return `#### ${children}\n`
    case nodeTypes.heading[5]:
      return `##### ${children}\n`
    case nodeTypes.heading[6]:
      return `###### ${children}\n`
    case nodeTypes.blockquote:
      return `> ${children}\n`
    case nodeTypes.code_block:
      return `\`\`\`${
        (chunk as BlockType).language || ''
      }\n${children}\n\`\`\`\n`
    case nodeTypes.link:
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept from the original: `url` is read through an untyped cast
      return `[${children}](${(chunk as any).url || ''})`
    case nodeTypes.image:
      // Fall back to an empty alt text so a missing caption does not render
      // as the literal string "undefined".
      return `![${(chunk as BlockType).caption || ''}](${
        (chunk as BlockType).link || ''
      })`
    case nodeTypes.listItemChild: {
      // Braces give these declarations their own scope instead of leaking
      // them into the other cases of the switch.
      const isOL = chunk.parentType === nodeTypes.ol_list
      const listIndex = 'listIndex' in chunk ? chunk.listIndex : undefined
      const treatAsLeaf =
        (chunk as BlockType).children.length === 1 &&
        isLeafNode((chunk as BlockType).children[0])

      // One spacer per nesting level (depth 0 gets no indentation).
      // NOTE(review): both branches append the same literal as seen here;
      // confirm against the linked issue whether ordered lists were meant to
      // use a wider indent than unordered ones.
      let spacer = ''
      for (let k = 0; listDepth > k; k++) {
        if (isOL) {
          // https://github.com/remarkjs/remark-react/issues/65
          spacer += ' '
        } else {
          spacer += ' '
        }
      }

      // Only a single-leaf item needs its own line break; nested blocks
      // already end with one.
      return `${spacer}${isOL ? `${listIndex}.` : '-'} ${children}${
        treatAsLeaf ? '\n' : ''
      }`
    }
    case nodeTypes.paragraph:
      return `${children}\n`
    case nodeTypes.thematic_break:
      return `---\n`
    default: {
      if (isWhatsApp) {
        return children
      }
      return escapeHtml(children)
    }
  }
}
/**
 * Wraps the non-whitespace core of `string` in a formatting marker while
 * keeping the surrounding whitespace outside of the markers.
 *
 * " foo " with format "**" becomes " **foo** " rather than the invalid
 * "** foo **". The closing marker is the reverse of the opening one so that
 * combined markers nest properly, e.g. "~~***FooBar***~~".
 *
 * @param string - Text to decorate; may have leading/trailing whitespace.
 * @param format - Opening marker (e.g. "**", "_", "~~***").
 * @returns The decorated text. Input that is empty or whitespace-only is
 * returned unchanged, because wrapping nothing would emit bare markers such
 * as "****" that markdown can read as something else entirely.
 */
function retainWhitespaceAndFormat(string: string, format: string): string {
  const trimmed = string.trim()
  if (trimmed === '') return string

  // `trimmed` starts with a non-whitespace character and everything before it
  // in `string` is whitespace, so its first occurrence is the trimmed core.
  const start = string.indexOf(trimmed)
  const leading = string.slice(0, start)
  const trailing = string.slice(start + trimmed.length)

  // Assemble by concatenation rather than `String.prototype.replace`: with a
  // string replacement, sequences like "$$", "$`" or "$'" inside the text
  // would be interpreted as replacement patterns and corrupt the output.
  return `${leading}${format}${trimmed}${reverseStr(format)}${trailing}`
}

/**
 * Returns `string` with its characters in reverse order. Iterates by code
 * point so surrogate pairs are not split.
 */
const reverseStr = (string: string) => Array.from(string).reverse().join('')