-
Notifications
You must be signed in to change notification settings - Fork 15
Improve validation performance by caching compiled schematron Templates to avoid repeated XSLT compilation #1572
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c81923f
521c9ee
0d498a3
028a625
c0cae5d
247d0b2
3b3757a
93158af
e84ab06
29999d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,9 +20,15 @@ | |
| import java.io.StringWriter; | ||
| import java.net.URL; | ||
| import java.nio.charset.StandardCharsets; | ||
| import java.security.MessageDigest; | ||
| import java.security.NoSuchAlgorithmException; | ||
| import java.util.HexFormat; | ||
| import java.util.Map; | ||
| import java.util.concurrent.ConcurrentHashMap; | ||
| import javax.xml.parsers.DocumentBuilderFactory; | ||
| import javax.xml.XMLConstants; | ||
| import javax.xml.transform.Source; | ||
| import javax.xml.transform.Templates; | ||
| import javax.xml.transform.Transformer; | ||
| import javax.xml.transform.TransformerConfigurationException; | ||
| import javax.xml.transform.TransformerException; | ||
|
|
@@ -44,6 +50,7 @@ | |
| */ | ||
| public class SchematronTransformer { | ||
| private static final Logger LOG = LoggerFactory.getLogger(SchematronTransformer.class); | ||
| private final Map<String, Templates> cachedTemplates = new ConcurrentHashMap<>(); | ||
|
|
||
| /** | ||
| * Constructor. | ||
|
|
@@ -97,23 +104,69 @@ public Transformer transform(Source source) throws TransformerException { | |
| * | ||
| * @throws TransformerException If an error occurred during the transform process. | ||
| */ | ||
| // Not cached — Source objects are not stably comparable/hashable. | ||
| // In practice LabelValidator only calls the String-based overload (line 622), | ||
| // so this uncached path does not affect bundle-validation performance. | ||
| public Transformer transform(Source source, ProblemHandler handler) throws TransformerException { | ||
| Transformer isoTransformer = this.buildIsoTransformer(); | ||
| return compileSchematron(source, handler).newTransformer(); | ||
| } | ||
|
|
||
| private Templates compileSchematron(Source source, ProblemHandler handler) throws TransformerException { | ||
| Transformer isoTransformer = this.buildIsoTransformer(); | ||
| if (handler != null) { | ||
| isoTransformer.setErrorListener(new TransformerErrorListener(handler)); | ||
| } | ||
| StringWriter schematronStyleSheet = new StringWriter(); | ||
| isoTransformer.transform(source, new StreamResult(schematronStyleSheet)); | ||
| return TransformerFactory.newInstance() | ||
| .newTransformer(new StreamSource(new StringReader(schematronStyleSheet.toString()))); | ||
| .newTemplates(new StreamSource(new StringReader(schematronStyleSheet.toString()))); | ||
| } | ||
|
Comment on lines
121
to
123
|
||
|
|
||
| public Transformer transform(String source) throws TransformerException { | ||
| return this.transform(source, null); | ||
| } | ||
| public Transformer transform(String source, ProblemHandler handler) throws TransformerException { | ||
| return transform (new StreamSource(new StringReader(source)), handler); | ||
| String key = sha256(source); | ||
| // Intentional check-then-act: a concurrent miss may compile twice, but | ||
| // both results are equivalent and the race is self-healing after warmup. | ||
| // computeIfAbsent is avoided because it holds a lock during compilation. | ||
| // handler is intentionally unused on cache hit — compilation errors | ||
| // would only occur during the first call (cache miss), not on reuse. | ||
| Templates templates = cachedTemplates.get(key); | ||
| if (templates == null) { | ||
| LOG.debug("transform: cache miss, compiling schematron (length={})", source.length()); | ||
| templates = compileSchematron(new StreamSource(new StringReader(source)), handler); | ||
| cachedTemplates.put(key, templates); | ||
|
Comment on lines
+129
to
+139
|
||
| } else { | ||
| LOG.debug("transform: cache hit, reusing compiled schematron (length={})", source.length()); | ||
| } | ||
| return templates.newTransformer(); | ||
| } | ||
|
|
||
| // Package-private: cache lifetime is tied to the SchematronTransformer instance. | ||
| // LabelValidator.clear() creates a new instance (line 219), which naturally | ||
| // starts with an empty cache. Exposed for testing. | ||
| void clearCache() { | ||
| cachedTemplates.clear(); | ||
|
Comment on lines
+146
to
+150
|
||
| } | ||
|
|
||
| /** Returns the number of cached templates entries. Package-private for testing. */ | ||
| int cacheSize() { | ||
| return cachedTemplates.size(); | ||
| } | ||
|
|
||
| private static final ThreadLocal<MessageDigest> SHA256 = ThreadLocal.withInitial(() -> { | ||
| try { | ||
| return MessageDigest.getInstance("SHA-256"); | ||
| } catch (NoSuchAlgorithmException e) { | ||
| throw new RuntimeException(e); | ||
| } | ||
| }); | ||
|
|
||
| private static String sha256(String input) { | ||
| MessageDigest digest = SHA256.get(); | ||
| digest.reset(); | ||
| return HexFormat.of().formatHex(digest.digest(input.getBytes(StandardCharsets.UTF_8))); | ||
| } | ||
| /** | ||
| * Transform the given schematron. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| package gov.nasa.pds.tools.label; | ||
|
|
||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
| import static org.junit.jupiter.api.Assertions.assertNotSame; | ||
|
|
||
| import javax.xml.transform.Transformer; | ||
| import org.junit.jupiter.api.BeforeEach; | ||
| import org.junit.jupiter.api.Tag; | ||
| import org.junit.jupiter.api.Test; | ||
|
|
||
| /** | ||
| * Tests for {@link SchematronTransformer} cache behavior. | ||
| * | ||
| * <p>These tests exercise the full ISO schematron → XSLT compilation pipeline and | ||
| * therefore require Saxon-HE on the classpath. They are slower than pure unit tests | ||
| * on first run due to XSLT compilation overhead. | ||
| */ | ||
| @Tag("integration") | ||
| public class SchematronTransformerTest { | ||
|
|
||
| /** Minimal valid ISO Schematron document. */ | ||
| private static final String MINIMAL_SCHEMATRON = | ||
| "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" | ||
| + "<sch:schema xmlns:sch=\"http://purl.oclc.org/dsdl/schematron\" queryBinding=\"xslt2\">\n" | ||
| + " <sch:title>Minimal test schematron</sch:title>\n" | ||
| + " <sch:pattern>\n" | ||
| + " <sch:rule context=\"/root\">\n" | ||
| + " <sch:assert test=\"true()\">Always passes.</sch:assert>\n" | ||
| + " </sch:rule>\n" | ||
| + " </sch:pattern>\n" | ||
| + "</sch:schema>"; | ||
|
|
||
| /** A different schematron to verify distinct cache entries. */ | ||
| private static final String OTHER_SCHEMATRON = | ||
| "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" | ||
| + "<sch:schema xmlns:sch=\"http://purl.oclc.org/dsdl/schematron\" queryBinding=\"xslt2\">\n" | ||
| + " <sch:title>Other test schematron</sch:title>\n" | ||
| + " <sch:pattern>\n" | ||
| + " <sch:rule context=\"/other\">\n" | ||
| + " <sch:assert test=\"true()\">Also passes.</sch:assert>\n" | ||
| + " </sch:rule>\n" | ||
| + " </sch:pattern>\n" | ||
| + "</sch:schema>"; | ||
|
|
||
| private SchematronTransformer transformer; | ||
|
|
||
| @BeforeEach | ||
| void setUp() throws Exception { | ||
| transformer = new SchematronTransformer(); | ||
| } | ||
|
|
||
| @Test | ||
| void testCachePopulatedOnFirstCall() throws Exception { | ||
| assertEquals(0, transformer.cacheSize(), "cache should start empty"); | ||
|
|
||
| Transformer t = transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertNotNull(t); | ||
| assertEquals(1, transformer.cacheSize(), "cache should have one entry after first call"); | ||
| } | ||
|
|
||
| @Test | ||
| void testCacheHitOnRepeatedCall() throws Exception { | ||
| Transformer t1 = transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertEquals(1, transformer.cacheSize()); | ||
|
|
||
| Transformer t2 = transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertEquals(1, transformer.cacheSize(), "cache size should remain 1 after repeated call"); | ||
|
|
||
| // Each call should return a distinct Transformer (Transformer is not thread-safe) | ||
| assertNotSame(t1, t2, "each call should produce a new Transformer instance"); | ||
| } | ||
|
|
||
| @Test | ||
| void testDistinctSchematronsGetSeparateCacheEntries() throws Exception { | ||
| transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertEquals(1, transformer.cacheSize()); | ||
|
|
||
| transformer.transform(OTHER_SCHEMATRON); | ||
| assertEquals(2, transformer.cacheSize(), "distinct schematrons should produce separate cache entries"); | ||
| } | ||
|
|
||
| @Test | ||
| void testClearCacheRemovesAllEntries() throws Exception { | ||
| transformer.transform(MINIMAL_SCHEMATRON); | ||
| transformer.transform(OTHER_SCHEMATRON); | ||
| assertEquals(2, transformer.cacheSize()); | ||
|
|
||
| transformer.clearCache(); | ||
| assertEquals(0, transformer.cacheSize(), "cache should be empty after clearCache()"); | ||
| } | ||
|
|
||
| @Test | ||
| void testCacheMissAfterClear() throws Exception { | ||
| transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertEquals(1, transformer.cacheSize()); | ||
|
|
||
| transformer.clearCache(); | ||
| assertEquals(0, transformer.cacheSize()); | ||
|
|
||
| // Re-compile should repopulate the cache | ||
| Transformer t = transformer.transform(MINIMAL_SCHEMATRON); | ||
| assertNotNull(t); | ||
| assertEquals(1, transformer.cacheSize(), "cache should repopulate after clear"); | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new inline comment references specific line numbers in
LabelValidator(e.g., “line 622”). These line numbers will drift over time and make the comment misleading. Prefer referencing the method name/call site (e.g.,LabelValidator.parseAndValidate()uses the String overload) without hard-coded line numbers.