Skip to content

Commit ea1313a

Browse files
authored
Merge pull request #1 from pegmalibrary/fix-atomic-tag-recognition
Fix the issue that, during parsing, some tags were incorrectly recognized as atomic tags. For example, the tag "<abb" was recognized as an "<a" tag and was therefore handled as an atomic tag, which is wrong. The commit [1] was taken as a template for the fix. [1] felipe-issa-zebra@b5497b4
2 parents 063d3ee + dd8fec4 commit ea1313a

File tree

5 files changed

+72
-20
lines changed

5 files changed

+72
-20
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@ of these three parameters it will be ignored:
5353
not be compared - the entire tag should be treated as one token. This is useful for tags
5454
where it does not make sense to insert `<ins>` and `<del>` tags. If not used, the default
5555
list will be used:
56-
`iframe,object,math,svg,script,video,head,style`.
56+
`iframe,object,math,svg,script,video,head,style`.
57+
The tags specified here are matched as prefixes ('begins with'). So if the tag 'i' is added, <i>
58+
tags will be treated as atomic, and so will <img>. If you wish to exclude the <img> tag
59+
from the 'i' match, configure it as 'i(?!mg)'.
5760

5861

5962
### Example
@@ -75,7 +78,7 @@ Result:
7578
## Development
7679
* `npm install` to install dependencies
7780
* `npm run lint` to ESLint the TypeScript
78-
* `npm run make` to compile the TypeScript
81+
* `npm run build` to transpile the TypeScript to JavaScript
7982
* `npm run test` to run the tests
8083

8184
## Credits

dist/htmldiff.js

Lines changed: 28 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/htmldiff.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/htmldiff.ts

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ function isEndOfHTMLComment(word: string): boolean {
6969
}
7070

7171
// Added head and style (for style tags inside the body)
72-
const atomicTagsRegExp = /^<(iframe|object|math|svg|script|video|head|style|a)/;
72+
const atomicTagsRegExp = /^<(iframe|object|math|svg|script|video|head|style|a)$/;
7373

7474
/**
7575
* Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose
@@ -264,7 +264,7 @@ export function htmlToTokens(html: string): Token[] {
264264
const char = html[charIdx] as string;
265265
switch (mode){
266266
case 'tag': {
267-
const atomicTag = isStartOfAtomicTag(currentWord);
267+
const atomicTag = (' ' === char || '/' === char || '>' === char) ? isStartOfAtomicTag(currentWord) : false;
268268
const styleTag = isStartOfStyleTag(currentWord + char);
269269
const latestStyleTag = currentStyleTags.length && currentStyleTags[currentStyleTags.length - 1];
270270
const endOfStyleTag = isEndOfTag(char) && latestStyleTag && isEndOfStyleTag(currentWord, latestStyleTag);
@@ -313,7 +313,7 @@ export function htmlToTokens(html: string): Token[] {
313313
break;
314314
}
315315
case 'atomic_tag':
316-
if (isEndOfTag(char) && isEndOfAtomicTag(currentWord, currentAtomicTag)){
316+
if (isEndOfTag(char) && (isImage(currentWord + '>') || isEndOfAtomicTag(currentWord, currentAtomicTag))){
317317
currentWord += '>';
318318
words.push(createToken(currentWord, currentStyleTags, currentTableTags));
319319
currentWord = '';
@@ -445,6 +445,16 @@ function getKeyForToken(token: string){
445445
return token;
446446
}
447447

448+
/**
449+
* Checks if a given token is an image (`<img>`) tag with a quoted `src` attribute
450+
*
451+
* @param {} token
452+
* @returns
453+
*/
454+
function isImage(token: string) {
455+
return /^<img.*src=['"]([^"']*)['"].*>$/.exec(token);
456+
}
457+
448458
const tokenMapKey = (token: Token) => token.key + JSON.stringify(token.styles) + JSON.stringify(token.tableTags);
449459

450460
/**
@@ -639,16 +649,16 @@ function getFullMatch(segment: Segment, beforeStart: number, afterStart: number,
639649
}
640650
}
641651

642-
// Extend the current match as far foward as it can go, without overflowing beforeTokens or
652+
// Extend the current match as far forward as it can go, without overflowing beforeTokens or
643653
// afterTokens.
644654
let searching = true;
645655
let currentLength = 1;
646656
let beforeIndex = beforeStart + currentLength;
647657
let afterIndex = afterStart + currentLength;
648658

649659
while (searching && beforeIndex < beforeTokens.length && afterIndex < afterTokens.length){
650-
const beforeWord = beforeTokens[beforeIndex]?.key;
651-
const afterWord = afterTokens[afterIndex]?.key;
660+
const beforeWord = getTextToCompare(beforeIndex, beforeTokens);
661+
const afterWord = getTextToCompare(afterIndex, afterTokens);
652662
const beforeStyle = JSON.stringify(beforeTokens[beforeIndex]?.styles);
653663
const afterStyle = JSON.stringify(afterTokens[afterIndex]?.styles);
654664
if (beforeWord === afterWord && beforeStyle === afterStyle){
@@ -675,6 +685,16 @@ function getFullMatch(segment: Segment, beforeStart: number, afterStart: number,
675685

676686
return makeMatch(beforeStart, afterStart, currentLength, segment);
677687
}
688+
689+
function getTextToCompare(index: number, tokens: any[]): string {
690+
const token = tokens[index];
691+
if (!token) {
692+
throw Error(`Expected ${tokens} to have an element at position ${index}`);
693+
}
694+
const key = !!isStartOfAtomicTag(token.key) ? 'string' : 'key';
695+
return token[key];
696+
}
697+
678698
type Segment = {
679699
beforeTokens: Token[];
680700
afterTokens: Token[];

test/diff.spec.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,16 @@ describe('Diff', function(){
225225
});
226226
});
227227

228+
describe('processing tags', function(){
229+
it('should detect atomic tag correctly', function() {
230+
res = diff(
231+
'Some <abb class=" my-abb">Text</abb> within <embb class=" my-embb">custom tags</embb>',
232+
'Some <abb class=" my-abb"> other Text</abb> within <embb class=" my-embb">the same tags</embb>'
233+
);
234+
expect(res).to.equal(
235+
'Some <abb class=" my-abb"><ins data-operation-index="1"> other </ins>Text</abb> within <embb class=" my-embb"><del data-operation-index="3">custom</del><ins data-operation-index="3">the same</ins> tags</embb>'
236+
);
237+
});
238+
});
239+
228240
});

0 commit comments

Comments
 (0)