Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions src/main/java/org/codelibs/fess/job/HtmlIndexExportFormatter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright 2012-2025 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.job;

import java.util.Collection;
import java.util.Map;
import java.util.Set;

/**
 * Formatter that outputs index documents as HTML files.
 *
 * <p>The {@code title}, {@code content} and {@code lang} fields of a document are rendered as the
 * page title, the body text and the {@code lang} attribute of the root element. Every other field
 * that is not excluded is written as a {@code <meta name="fess:FIELD" content="VALUE">} tag, with
 * one tag per element for collection-valued fields. All field names and values are HTML-escaped
 * before being written.</p>
 */
public class HtmlIndexExportFormatter implements IndexExportFormatter {

    /**
     * Creates a new HtmlIndexExportFormatter instance.
     */
    public HtmlIndexExportFormatter() {
        // default constructor
    }

    /**
     * Returns the file extension used for HTML output.
     *
     * @return the string {@code ".html"}
     */
    @Override
    public String getFileExtension() {
        return ".html";
    }

    /**
     * Returns the index file name used for HTML output.
     *
     * @return the string {@code "index.html"}
     */
    @Override
    public String getIndexFileName() {
        return "index.html";
    }

    /**
     * Renders a document source map as a complete HTML document.
     *
     * @param source the document source map; must not be {@code null}
     * @param excludeFields field names that must not be written as meta tags; {@code null} is
     *        treated as "exclude nothing"
     * @return the generated HTML document
     */
    @Override
    public String format(final Map<String, Object> source, final Set<String> excludeFields) {
        final String title = escapeHtml(getStringValue(source, "title"));
        final String content = escapeHtml(getStringValue(source, "content"));
        final String lang = escapeHtml(getStringValue(source, "lang"));

        final StringBuilder html = new StringBuilder();
        html.append("<!DOCTYPE html>\n");
        html.append("<html lang=\"").append(lang).append("\">\n");
        html.append("<head>\n");
        html.append("<meta charset=\"UTF-8\">\n");
        html.append("<title>").append(title).append("</title>\n");

        for (final Map.Entry<String, Object> entry : source.entrySet()) {
            final String field = entry.getKey();
            // title/content/lang already have dedicated elements, so they never become meta tags.
            if (isStructuralField(field)) {
                continue;
            }
            if (excludeFields != null && excludeFields.contains(field)) {
                continue;
            }

            final Object value = entry.getValue();
            if (value instanceof Collection) {
                // Multi-valued fields produce one meta tag per element.
                for (final Object item : (Collection<?>) value) {
                    appendMetaTag(html, field, item);
                }
            } else {
                appendMetaTag(html, field, value);
            }
        }

        html.append("</head>\n");
        html.append("<body>\n");
        html.append(content).append("\n");
        html.append("</body>\n");
        html.append("</html>\n");

        return html.toString();
    }

    /**
     * Tells whether a field is rendered through a dedicated HTML element rather than a meta tag.
     */
    private boolean isStructuralField(final String field) {
        return "title".equals(field) || "content".equals(field) || "lang".equals(field);
    }

    /**
     * Appends one {@code <meta name="fess:FIELD" content="VALUE">} line for a single value.
     * A {@code null} value is skipped, so scalar fields and collection elements are treated alike
     * and no literal {@code "null"} content is ever written.
     */
    private void appendMetaTag(final StringBuilder html, final String field, final Object value) {
        if (value == null) {
            return;
        }
        html.append("<meta name=\"fess:")
                .append(escapeHtml(field))
                .append("\" content=\"")
                .append(escapeHtml(String.valueOf(value)))
                .append("\">\n");
    }

    /**
     * Returns the string form of the value stored under {@code key}, or an empty string when the
     * key is absent or mapped to {@code null}.
     */
    private String getStringValue(final Map<String, Object> source, final String key) {
        final Object value = source.get(key);
        return value != null ? value.toString() : "";
    }

    /**
     * Escapes the five HTML-significant characters ({@code & < > " '}) in a single pass.
     * A {@code null} or empty input yields an empty string.
     */
    private String escapeHtml(final String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        final StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&#39;");
                break;
            default:
                escaped.append(c);
                break;
            }
        }
        return escaped.toString();
    }
}
48 changes: 48 additions & 0 deletions src/main/java/org/codelibs/fess/job/IndexExportFormatter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright 2012-2025 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.job;

import java.util.Map;
import java.util.Set;

/**
 * Strategy contract for turning an exported index document into text in one output format.
 * An implementation supplies the naming rules for the files it produces as well as the
 * serialization of a single document.
 */
public interface IndexExportFormatter {

    /**
     * Gives the extension appended to exported files of this format, for example
     * {@code ".html"} or {@code ".json"}.
     *
     * @return the extension, leading dot included
     */
    public abstract String getFileExtension();

    /**
     * Gives the file name used for an index file of this format, for example
     * {@code "index.html"} or {@code "index.json"}.
     *
     * @return the index file name
     */
    public abstract String getIndexFileName();

    /**
     * Serializes one document source map into this format.
     *
     * @param source the document source map to serialize
     * @param excludeFields names of the fields that must be left out of the output
     * @return the document rendered as a string
     */
    public abstract String format(Map<String, Object> source, Set<String> excludeFields);
}
136 changes: 54 additions & 82 deletions src/main/java/org/codelibs/fess/job/IndexExportJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
Expand All @@ -39,8 +38,8 @@
import org.opensearch.index.query.QueryBuilders;

/**
* Job for exporting indexed search documents as HTML files to the filesystem.
* Each document is exported as a single HTML file with URL structure mapped to directory structure.
* Job for exporting indexed search documents to the filesystem.
* Each document is exported as a single file with URL structure mapped to directory structure.
*/
public class IndexExportJob {

Expand All @@ -50,6 +49,8 @@ public class IndexExportJob {

private QueryBuilder queryBuilder;

private IndexExportFormatter formatter;

/**
* Creates a new IndexExportJob instance.
*/
Expand All @@ -69,7 +70,39 @@ public IndexExportJob query(final QueryBuilder queryBuilder) {
}

/**
* Executes the export job, writing each matching document as an HTML file.
* Sets the export format.
*
* @param format the format name (e.g. "html", "json")
* @return this instance for method chaining
*/
public IndexExportJob format(final String format) {
    // Resolve the name first: if it is rejected, the previously configured formatter is kept.
    final IndexExportFormatter selected = createFormatter(format);
    this.formatter = selected;
    // Returning the job itself allows fluent configuration.
    return this;
}

/**
 * Creates a formatter for the given format name.
 *
 * <p>The name is trimmed and matched case-insensitively against the supported formats
 * ({@code "html"} and {@code "json"}). Lower-casing uses {@link java.util.Locale#ROOT} so the
 * match does not depend on the JVM default locale.</p>
 *
 * @param format the format name
 * @return the formatter instance
 * @throws IllegalArgumentException if the format is null, empty, or not supported
 */
protected IndexExportFormatter createFormatter(final String format) {
    if (format == null || format.trim().isEmpty()) {
        throw new IllegalArgumentException("Export format must not be null or empty");
    }
    // With a default-locale toLowerCase(), a Turkish locale maps 'I' to dotless 'ı',
    // so "HTML" would not match "html". Locale.ROOT avoids that.
    final String normalized = format.trim().toLowerCase(java.util.Locale.ROOT);
    switch (normalized) {
    case "html":
        return new HtmlIndexExportFormatter();
    case "json":
        return new JsonIndexExportFormatter();
    default:
        // Report the caller's original value, not the normalized one.
        throw new IllegalArgumentException("Unsupported export format: " + format);
    }
}
Comment on lines +90 to +102
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

toLowerCase() should specify a locale to avoid locale-dependent behavior (e.g., Turkish locale casing issues). Use toLowerCase(Locale.ROOT) (and add the corresponding import) for stable format matching.

Copilot uses AI. Check for mistakes.

/**
* Executes the export job, writing each matching document as a file.
*
* @return a string containing the execution result or error messages
*/
Expand All @@ -86,6 +119,9 @@ public String execute() {
.collect(Collectors.toSet());
final int scrollSize = fessConfig.getIndexExportScrollSizeAsInteger();

final IndexExportFormatter resolvedFormatter =
this.formatter != null ? this.formatter : createFormatter(fessConfig.getIndexExportFormat());

final QueryBuilder query = queryBuilder != null ? queryBuilder : QueryBuilders.matchAllQuery();

if (logger.isInfoEnabled()) {
Expand All @@ -101,7 +137,7 @@ public String execute() {
requestBuilder.setQuery(query).setSize(scrollSize);
return true;
}, source -> {
exportDocument(source, exportPath, excludeFields);
exportDocument(source, exportPath, excludeFields, resolvedFormatter);
final long currentCount = processedCount.incrementAndGet();
if (logger.isDebugEnabled() && currentCount % scrollSize == 0) {
logger.debug("[EXPORT] Processing: count={}", currentCount);
Expand All @@ -121,29 +157,31 @@ public String execute() {
}

/**
* Exports a single document as an HTML file.
* Exports a single document as a file.
*
* @param source the document source map
* @param exportPath the base export directory path
* @param excludeFields the set of field names to exclude from output
* @param formatter the formatter to use for output
*/
protected void exportDocument(final Map<String, Object> source, final String exportPath, final Set<String> excludeFields) {
protected void exportDocument(final Map<String, Object> source, final String exportPath, final Set<String> excludeFields,
final IndexExportFormatter formatter) {
final Object urlObj = source.get("url");
if (urlObj == null) {
logger.debug("Skipping document without url field.");
return;
}

final String url = urlObj.toString();
final Path filePath = buildFilePath(exportPath, url);
final Path filePath = buildFilePath(exportPath, url, formatter);
if (logger.isDebugEnabled()) {
logger.debug("[EXPORT] Exporting document: url={}, path={}", url, filePath);
}
final String html = buildHtml(source, excludeFields);
final String content = formatter.format(source, excludeFields);

try {
Files.createDirectories(filePath.getParent());
Files.writeString(filePath, html, StandardCharsets.UTF_8);
Files.writeString(filePath, content, StandardCharsets.UTF_8);
} catch (final IOException e) {
logger.warn("Failed to export document: url={}", url, e);
}
Expand All @@ -154,9 +192,10 @@ protected void exportDocument(final Map<String, Object> source, final String exp
*
* @param exportPath the base export directory path
* @param url the document URL
* @param formatter the formatter to determine file extensions
* @return the target file path
*/
protected Path buildFilePath(final String exportPath, final String url) {
protected Path buildFilePath(final String exportPath, final String url, final IndexExportFormatter formatter) {
try {
final URI uri = new URI(url);
String host = uri.getHost();
Expand All @@ -167,11 +206,11 @@ protected Path buildFilePath(final String exportPath, final String url) {
}

if (path == null || path.isEmpty()) {
path = "/index.html";
path = "/" + formatter.getIndexFileName();
} else if (path.endsWith("/")) {
path = path + "index.html";
path = path + formatter.getIndexFileName();
} else if (!path.contains(".") || path.lastIndexOf('.') < path.lastIndexOf('/')) {
path = path + ".html";
path = path + formatter.getFileExtension();
}
Comment on lines 208 to 214
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When exporting JSON, buildFilePath() preserves an existing URL extension (e.g. /page.html stays .html) but the content written is JSON, which creates a file-extension/content mismatch and can break downstream consumers (and is surprising given index.export.format=json). Consider enforcing formatter.getFileExtension() for non-HTML formatters (e.g., replace any existing extension unless it already matches the formatter), or introduce an explicit option like “preserve original URL extension” and default it to false for JSON.

Copilot uses AI. Check for mistakes.

if (path.startsWith("/")) {
Expand All @@ -194,75 +233,8 @@ protected Path buildFilePath(final String exportPath, final String url) {
return Paths.get(exportPath, sanitized.toString());
} catch (final Exception e) {
logger.debug("Failed to parse URL: {}", url, e);
return Paths.get(exportPath, "_invalid", hashString(url) + ".html");
}
}

/**
* Builds an HTML string from a document source map.
*
* @param source the document source map
* @param excludeFields the set of field names to exclude from meta tags
* @return the generated HTML string
*/
protected String buildHtml(final Map<String, Object> source, final Set<String> excludeFields) {
final String title = escapeHtml(getStringValue(source, "title"));
final String content = escapeHtml(getStringValue(source, "content"));
final String lang = escapeHtml(getStringValue(source, "lang"));

final StringBuilder html = new StringBuilder();
html.append("<!DOCTYPE html>\n");
html.append("<html lang=\"").append(lang).append("\">\n");
html.append("<head>\n");
html.append("<meta charset=\"UTF-8\">\n");
html.append("<title>").append(title).append("</title>\n");

for (final Map.Entry<String, Object> entry : source.entrySet()) {
final String field = entry.getKey();
if ("title".equals(field) || "content".equals(field) || "lang".equals(field)) {
continue;
}
if (excludeFields.contains(field)) {
continue;
}

final Object value = entry.getValue();
if (value instanceof Collection) {
for (final Object item : (Collection<?>) value) {
html.append("<meta name=\"fess:")
.append(escapeHtml(field))
.append("\" content=\"")
.append(escapeHtml(String.valueOf(item)))
.append("\">\n");
}
} else if (value != null) {
html.append("<meta name=\"fess:")
.append(escapeHtml(field))
.append("\" content=\"")
.append(escapeHtml(String.valueOf(value)))
.append("\">\n");
}
}

html.append("</head>\n");
html.append("<body>\n");
html.append(content).append("\n");
html.append("</body>\n");
html.append("</html>\n");

return html.toString();
}

private String getStringValue(final Map<String, Object> source, final String key) {
final Object value = source.get(key);
return value != null ? value.toString() : "";
}

private String escapeHtml(final String text) {
if (text == null || text.isEmpty()) {
return "";
return Paths.get(exportPath, "_invalid", hashString(url) + formatter.getFileExtension());
}
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;").replace("'", "&#39;");
}

private String hashString(final String input) {
Expand Down
Loading