diff --git a/formcycle-adapter/formcycle-adapter-impl/src/main/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleaner.java b/formcycle-adapter/formcycle-adapter-impl/src/main/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleaner.java index 5e5c45a1d163167fcfa5dadeec5f798642e682ea..7ad155733dbc3f2ccce5aefd9b454b5bf10424fd 100644 --- a/formcycle-adapter/formcycle-adapter-impl/src/main/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleaner.java +++ b/formcycle-adapter/formcycle-adapter-impl/src/main/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleaner.java @@ -31,6 +31,7 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.safety.Safelist; import org.springframework.stereotype.Component; +import org.springframework.web.util.HtmlUtils; import de.ozgcloud.eingang.common.formdata.FormData; @@ -57,13 +58,14 @@ public class FormDataHtmlCleaner { return value; } - Object parseHtml(String html) { - var jsoupDocument = Jsoup.parse(html); + String parseHtml(String html) { var outputSettings = new Document.OutputSettings(); // keep new lines outputSettings.prettyPrint(false); - jsoupDocument.outputSettings(outputSettings); - var innerHtml = jsoupDocument.html().replace("\\\\n", "\n"); - return Jsoup.clean(innerHtml, "", Safelist.none(), outputSettings); + var htmlText = Jsoup.clean(replaceNewLines(html), "", Safelist.none(), outputSettings); + return HtmlUtils.htmlUnescape(htmlText); } + private String replaceNewLines(String html) { + return html.replace("\\\\n", "\n"); + } } diff --git a/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerITCase.java b/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerITCase.java new file mode 100644 index 0000000000000000000000000000000000000000..8cfcd3e7db38c32f2d7f66740b0f2f5678f264bd --- /dev/null +++ 
b/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerITCase.java @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2023 Das Land Schleswig-Holstein vertreten durch den + * Ministerpräsidenten des Landes Schleswig-Holstein + * Staatskanzlei + * Abteilung Digitalisierung und zentrales IT-Management der Landesregierung + * + * Lizenziert unter der EUPL, Version 1.2 oder - sobald + * diese von der Europäischen Kommission genehmigt wurden - + * Folgeversionen der EUPL ("Lizenz"); + * Sie dürfen dieses Werk ausschließlich gemäß + * dieser Lizenz nutzen. + * Eine Kopie der Lizenz finden Sie hier: + * + * https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 + * + * Sofern nicht durch anwendbare Rechtsvorschriften + * gefordert oder in schriftlicher Form vereinbart, wird + * die unter der Lizenz verbreitete Software "so wie sie + * ist", OHNE JEGLICHE GEWÄHRLEISTUNG ODER BEDINGUNGEN - + * ausdrücklich oder stillschweigend - verbreitet. + * Die sprachspezifischen Genehmigungen und Beschränkungen + * unter der Lizenz sind dem Lizenztext zu entnehmen. 
+ */ +package de.ozgcloud.eingang.formcycle; + +import static org.assertj.core.api.Assertions.*; + +import java.util.Map; + +import org.junit.jupiter.api.Test; +import org.mockito.InjectMocks; +import org.mockito.Spy; + +import de.ozgcloud.eingang.common.formdata.FormDataTestFactory; + +class FormDataHtmlCleanerITCase { + + @Spy + @InjectMocks + private FormDataHtmlCleaner cleaner; + + static final String KEY_LABEL = "label"; + static final String KEY_VALUE = "value"; + + static final Map<String, Object> FORM_DATA_MAP = Map.of("tf1", Map.of( + KEY_LABEL, "<p><em>Label</em></p>", + KEY_VALUE, "<i>Value</i>"), + "fs1", Map.of( + KEY_LABEL, "Key", + KEY_VALUE, Map.of( + "tf3", Map.of( + KEY_LABEL, "<p><s>Label</s></p>", + KEY_VALUE, "ein Text mit <html><body><h1>Hello</h1><body><html>")), + "tf4", Map.of( + KEY_LABEL, "<p><span style=\"background-color:#1abc9c;\">Wichtig</span></p>", + KEY_VALUE, "Text"), + "ed1", Map.of( + KEY_LABEL, + "<ol>\n\t<li><em><strong><u>ganz</u></strong></em></li>\n\t<li><span style=\"color:#e74c3c;" + + "\">bunt</span></li>\n</ol>", + KEY_VALUE, "TExt\nmit\n Leerzeichen\nund\n Umbrüchen" + ))); + + static final Map<String, Object> EXPECTED_MAP = Map.of("tf1", Map.of( + KEY_LABEL, "Label", + KEY_VALUE, "Value"), + "fs1", Map.of( + KEY_LABEL, "Key", + KEY_VALUE, Map.of( + "tf3", Map.of( + KEY_LABEL, "Label", + KEY_VALUE, "ein Text mit Hello")), + "tf4", Map.of( + KEY_LABEL, "Wichtig", + KEY_VALUE, "Text"), + "ed1", Map.of( + KEY_LABEL, + "\n\tganz\n\tbunt\n", + KEY_VALUE, "TExt\nmit\n Leerzeichen\nund\n Umbrüchen" + ))); + + @Test + void shouldCleanHtml() { + var result = cleaner.clean(FormDataTestFactory.createBuilder().formData(FORM_DATA_MAP).build()); + + assertThat(result.getFormData()).usingRecursiveComparison().isEqualTo(EXPECTED_MAP); + } + +} \ No newline at end of file diff --git a/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerTest.java 
b/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerTest.java index 4f1109c25ba64e079a4bf6bf68d5828095ec1bc1..ad1218279c8210b3ceb2b5c5036a23e0ddee6927 100644 --- a/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerTest.java +++ b/formcycle-adapter/formcycle-adapter-impl/src/test/java/de/ozgcloud/eingang/formcycle/FormDataHtmlCleanerTest.java @@ -30,8 +30,11 @@ import java.util.List; import java.util.Map; import org.assertj.core.data.MapEntry; +import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import org.mockito.InjectMocks; import org.mockito.Spy; @@ -172,58 +175,31 @@ class FormDataHtmlCleanerTest { } @Nested - class TestHtmlCleaner { - - static final String KEY_LABEL = "label"; - static final String KEY_VALUE = "value"; - - static final Map<String, Object> FORM_DATA_MAP = Map.of("tf1", Map.of( - KEY_LABEL, "<p><em>Ä</em></p>", - KEY_VALUE, "Ä - Wert"), - "tf2", Map.of( - KEY_LABEL, "<p><strong>Ö</strong></p>", - KEY_VALUE, "Ö - Wert"), - "fs1", Map.of( - KEY_LABEL, "Ü", - KEY_VALUE, Map.of( - "tf3", Map.of( - KEY_LABEL, " <p><s>Label mit</s> ß</p>", - KEY_VALUE, "ein Text mit ß und <html><body><h1>Hello</h1><body><html>")), - "tf4", Map.of( - KEY_LABEL, "<p><span style=\"background-color:#1abc9c;\">ä</span></p>", - KEY_VALUE, "Text"), - "ed1", Map.of( - KEY_LABEL, - "<ol>\n\t<li><em><strong><u>ö</u></strong></em></li>\n\t<li><span style=\"color:#e74c3c;\">ü</span></li>\n</ol>", - KEY_VALUE, "TExt\nmit\n Leerzeichen\nund\n Umbrüchen" - ))); - - static final Map<String, Object> EXPECTED_MAP = Map.of("tf1", Map.of( - KEY_LABEL, "Ä", - KEY_VALUE, "Ä - Wert"), - "tf2", Map.of( - KEY_LABEL, "Ö", - KEY_VALUE, "Ö - Wert"), - "fs1", Map.of( - KEY_LABEL, "Ü", - KEY_VALUE, Map.of( - "tf3", Map.of( - 
KEY_LABEL, "Label mit ß", - KEY_VALUE, "ein Text mit ß und Hello")), - "tf4", Map.of( - KEY_LABEL, "ä", - KEY_VALUE, "Text"), - "ed1", Map.of( - KEY_LABEL, - "\n\tö\n\tü\n", - KEY_VALUE, "TExt\nmit\n Leerzeichen\nund\n Umbrüchen" - ))); + class TestParseHtml { @Test - void shouldCleanHtml() { - var result = cleaner.clean(FormData.builder().formData(FORM_DATA_MAP).build()); + void shouldParseHtml() { + var result = cleaner.parseHtml("<html><body><h1>Hello</h1><body><html>"); - assertThat(result.getFormData()).isEqualTo(EXPECTED_MAP); + assertThat(result).isEqualTo("Hello"); + } + + @DisplayName("should decode german characters") + @ParameterizedTest(name = "{0} => {1}") + @CsvSource({ "&Auml;, Ä", "&Ouml;, Ö", "&Uuml;, Ü", "&szlig;, ß" }) + void shouldDecodeUmlaut(String srcValue, String expectedValue) { + var result = cleaner.parseHtml(srcValue); + + assertThat(result).isEqualTo(expectedValue); + } + + @DisplayName("should decode special characters") + @ParameterizedTest(name = "{0} => {1}") + @CsvSource(value = { "&amp;, &", "&lt;, <", "&gt;, >", "&quot;, \"", "&apos;, '" }, quoteCharacter = '`') + void shouldDecodeAmpersand(String src, String expected) { + var result = cleaner.parseHtml(src); + + assertThat(result).isEqualTo(expected); } } } \ No newline at end of file