Skip to content

Add Feature: Metadata Extraction from Files #2311

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ Integrates with Spring Data, Spring Data REST and Apache Solr</description>
<module>./spring-content-s3-boot-starter</module>
<module>./spring-content-renditions</module>
<module>./spring-content-renditions-boot-starter</module>
<module>./spring-content-metadata-extraction</module>
<module>./spring-content-metadata-extraction-boot-starter</module>
<module>./spring-content-solr</module>
<module>./spring-content-solr-boot-starter</module>
<module>./spring-content-elasticsearch</module>
Expand Down
6 changes: 6 additions & 0 deletions spring-content-autoconfigure/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@
<version>3.0.17-SNAPSHOT</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.github.paulcwarren</groupId>
<artifactId>spring-content-metadata-extraction</artifactId>
<version>3.0.17-SNAPSHOT</version>
<optional>true</optional>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.github.paulcwarren</groupId>-->
<!-- <artifactId>spring-content-docx4j</artifactId>-->
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. */
package internal.org.springframework.content.metadataextraction.boot.autoconfigure;

import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.content.metadataextraction.config.MetadataExtractionConfiguration;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;

/**
 * Spring Boot auto-configuration entry point for the metadata extraction module.
 * <p>
 * The class declares no beans itself. It is guarded by {@link ConditionalOnClass} on
 * {@link MetadataExtractionConfiguration}, so it only applies when that class is available on the classpath,
 * and it pulls that same configuration into the application context through {@link Import}.
 * </p>
 * <p>
 * NOTE(review): which beans end up registered is decided by {@link MetadataExtractionConfiguration}, which is
 * defined in another module - confirm there when changing what this auto-configuration is expected to provide.
 * </p>
 *
 * @author marcobelligoli
 */
@Configuration
@ConditionalOnClass(MetadataExtractionConfiguration.class)
@Import(MetadataExtractionConfiguration.class)
public class MetadataExtractionContentAutoConfiguration {

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ internal.org.springframework.content.fs.boot.autoconfigure.FilesystemContentAuto
internal.org.springframework.content.jpa.boot.autoconfigure.JpaContentAutoConfiguration
internal.org.springframework.content.mongo.boot.autoconfigure.MongoContentAutoConfiguration
internal.org.springframework.content.renditions.boot.autoconfigure.RenditionsContentAutoConfiguration
internal.org.springframework.content.metadataextraction.boot.autoconfigure.MetadataExtractionContentAutoConfiguration
internal.org.springframework.content.rest.boot.autoconfigure.ContentRestAutoConfiguration
internal.org.springframework.content.rest.boot.autoconfigure.HypermediaAutoConfiguration
internal.org.springframework.content.s3.boot.autoconfigure.S3ContentAutoConfiguration
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. */
package org.springframework.content.metadataextraction.boot;

import com.github.paulcwarren.ginkgo4j.Ginkgo4jConfiguration;
import com.github.paulcwarren.ginkgo4j.Ginkgo4jRunner;
import internal.org.springframework.content.s3.boot.autoconfigure.S3ContentAutoConfiguration;
import internal.org.springframework.content.solr.boot.autoconfigure.SolrAutoConfiguration;
import internal.org.springframework.content.solr.boot.autoconfigure.SolrExtensionAutoConfiguration;
import org.junit.runner.RunWith;
import org.springframework.boot.autoconfigure.AutoConfigurationPackage;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.content.commons.metadataextraction.MetadataExtractionService;
import org.springframework.content.commons.renditions.Renderable;
import org.springframework.content.commons.repository.ContentStore;
import org.springframework.content.metadataextraction.extractors.DefaultMetadataExtractor;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.EnableMBeanExport;
import org.springframework.jmx.support.RegistrationPolicy;
import org.springframework.support.TestEntity;

import static com.github.paulcwarren.ginkgo4j.Ginkgo4jDSL.Context;
import static com.github.paulcwarren.ginkgo4j.Ginkgo4jDSL.Describe;
import static com.github.paulcwarren.ginkgo4j.Ginkgo4jDSL.It;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.MatcherAssert.assertThat;

/**
 * Verifies that the metadata extraction auto-configuration
 * ({@code MetadataExtractionContentAutoConfiguration}) contributes its beans to an application context
 * that relies on Spring Boot auto-configuration only.
 *
 * @author marcobelligoli
 */
@RunWith(Ginkgo4jRunner.class)
@Ginkgo4jConfiguration(threads = 1)
public class ContentMetadataExtractionAutoConfigurationTest {

    {
        Describe("ContentMetadataExtractionAutoConfiguration",
                () -> Context("given a default configuration", () -> It("should load the all metadata extractors", () -> {

                    // try-with-resources closes the context even when refresh() or an assertion fails,
                    // so a failing spec cannot leak the context into later specs.
                    try (AnnotationConfigApplicationContext context = new AnnotationConfigApplicationContext()) {
                        context.register(TestConfig.class);
                        context.refresh();

                        // getBean(Class) throws when no bean of the type exists, so a missing bean fails the spec.
                        assertThat(context.getBean(MetadataExtractionService.class), is(not(nullValue())));
                        assertThat(context.getBean(DefaultMetadataExtractor.class), is(not(nullValue())));
                    }
                })));
    }

    /**
     * Minimal application configuration for the spec: enables auto-configuration while excluding the Solr and S3
     * auto-configurations, and tolerates already-registered MBeans.
     */
    @Configuration
    @AutoConfigurationPackage
    @EnableAutoConfiguration(exclude = { SolrAutoConfiguration.class, SolrExtensionAutoConfiguration.class, S3ContentAutoConfiguration.class })
    @EnableMBeanExport(registration = RegistrationPolicy.IGNORE_EXISTING)
    public static class TestConfig {

    }

    /**
     * Content store declared alongside the test configuration.
     * NOTE(review): it is not referenced by the spec directly; presumably it exists so that store
     * auto-configuration has a store to detect - confirm before removing.
     */
    public interface TestEntityContentStore extends ContentStore<TestEntity, String>, Renderable<TestEntity> {

    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. */
package internal.org.springframework.content.commons.metadataextraction;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.content.commons.metadataextraction.MetadataExtractionService;
import org.springframework.content.commons.metadataextraction.MetadataExtractor;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Implementation of the {@link MetadataExtractionService} interface that delegates to a fixed list of
 * {@link MetadataExtractor} instances and merges their results into a single map.
 * <p>
 * Extractors are consulted in the order in which they were passed to the constructor. When several extractors
 * report the same key, the value produced by the extractor consulted last is the one that is kept.
 * </p>
 *
 * @author marcobelligoli
 */
public class MetadataExtractionServiceImpl implements MetadataExtractionService {

    private final List<MetadataExtractor> metadataExtractorList = new ArrayList<>();

    /**
     * Creates the service with the extractors it should delegate to.
     *
     * @param metadataExtractors the extractors to use; may be empty or {@code null}, in which case the service
     *                           produces empty metadata maps. {@code null} entries are ignored.
     */
    @Autowired(required = false)
    public MetadataExtractionServiceImpl(MetadataExtractor... metadataExtractors) {

        // A null varargs array (explicit null argument) is treated the same as "no extractors".
        if (metadataExtractors == null) {
            return;
        }
        for (MetadataExtractor metadataExtractor : metadataExtractors) {
            // Skip null entries up front so extractMetadata never dereferences one.
            if (metadataExtractor != null) {
                this.metadataExtractorList.add(metadataExtractor);
            }
        }
    }

    /**
     * Runs every configured extractor against the file and merges the results.
     *
     * @param file the file from which metadata will be extracted; passed unchanged to each extractor
     * @return a new, mutable map with the combined metadata; empty when there are no extractors or none of them
     *         reported anything. Never {@code null}.
     */
    @Override
    public Map<String, Object> extractMetadata(File file) {

        Map<String, Object> fullMetadataMap = new HashMap<>();
        for (MetadataExtractor metadataExtractor : metadataExtractorList) {
            Map<String, Object> extractedMetadata = metadataExtractor.extractMetadata(file);
            // putAll(null) throws NullPointerException; an extractor with nothing to report must not break the rest.
            if (extractedMetadata != null) {
                fullMetadataMap.putAll(extractedMetadata);
            }
        }
        return fullMetadataMap;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. */
package org.springframework.content.commons.metadataextraction;

import java.io.File;
import java.util.Map;

/**
 * A service that extracts metadata from a given file.
 * <p>
 * The extracted metadata is returned as a map whose keys are metadata property names and whose values are the
 * corresponding metadata values.
 * </p>
 * <p>
 * The implementation shipped with this module, {@code MetadataExtractionServiceImpl}, delegates to every
 * {@link MetadataExtractor} it was constructed with and merges their results into one map. When two extractors
 * report the same key, the value from the extractor consulted later replaces the earlier one.
 * </p>
 *
 * @author marcobelligoli
 */
public interface MetadataExtractionService {

/**
 * Extracts metadata from the specified file.
 *
 * @param file the file from which metadata will be extracted
 * @return a map containing the extracted metadata, where keys are metadata property names and values are the
 *         respective metadata values
 */
Map<String, Object> extractMetadata(File file);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. */
package org.springframework.content.commons.metadataextraction;

import java.io.File;
import java.util.Map;

/**
 * Strategy for reading metadata out of a single file.
 * <p>
 * Each implementation contributes one set of key-value pairs for a file. Several extractors can be combined by a
 * {@link MetadataExtractionService}, which exposes the same method signature to callers.
 * </p>
 * <p>
 * NOTE(review): nullability of the result is not specified by this interface; returning an empty map when nothing
 * can be extracted is the safer choice for callers that merge results.
 * </p>
 *
 * @author marcobelligoli
 */
public interface MetadataExtractor {

/**
 * Extracts metadata from the given file.
 *
 * @param file the file from which metadata is to be extracted
 * @return a map containing the extracted metadata as key-value pairs
 */
Map<String, Object> extractMetadata(File file);
}
Loading