Skip to content

Commit f8692a1

Browse files
committed
feat: Add support for Tika MIME Types
This mostly adds extensions from the Tika data. It is not clear whether this will be acceptable because of the difference between the MIT and Apache 2 licences, and it is far too late to switch to the Apache 2 licence for this data since I am not the only contributor.
1 parent 2afe970 commit f8692a1

File tree

2 files changed

+125
-1
lines changed

2 files changed

+125
-1
lines changed

Rakefile

+7-1
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,17 @@ namespace :mime do
4141
IANARegistry.download(to: args.destination)
4242
end
4343

44-
desc "Download the current MIME type configuration from Apache."
44+
# mime:apache[destination] -- fetches the Apache httpd MIME type configuration
# through ApacheMIMETypes.download, passing the optional +destination+ task
# argument as the output location.
desc "Download the current MIME type configuration from Apache httpd."
task :apache, [:destination] do |_task, args|
  # Required inside the task body so the support code is only loaded when this
  # task actually runs.
  require "apache_mime_types"

  ApacheMIMETypes.download(to: args[:destination])
end
49+
50+
# mime:tika[destination] -- fetches the Apache Tika MIME type definitions
# through TikeMIMETypes.download, passing the optional +destination+ task
# argument as the output location.
desc "Download the current MIME type configuration from Apache Tika."
task :tika, [:destination] do |_task, args|
  # Required inside the task body so the support code is only loaded when this
  # task actually runs.
  require "tika_mime_types"

  # NOTE(review): "Tike" looks like a typo for "Tika", but the constant must be
  # spelled the way support/tika_mime_types.rb exposes it; keep the two in sync.
  TikeMIMETypes.download(to: args[:destination])
end
4955
end
5056

5157
task :version do

support/tika_mime_types.rb

+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# frozen_string_literal: true
2+
3+
$LOAD_PATH.unshift File.expand_path("../../lib", __FILE__)
4+
5+
require "open-uri"
6+
require "nokogiri"
7+
require "cgi"
8+
require "pathname"
9+
require "yaml"
10+
require "English"
11+
12+
require "mime/types/support"
13+
14+
# Updates the per-media-type YAML registries with data taken from the Apache
# Tika MIME type definitions (+tika-mimetypes.xml+).
#
# For every <tt>mime-type</tt> record the file extensions are derived from its
# <tt>glob</tt> children. Extensions are merged into matching types that already
# exist in the registry; otherwise a new MIME::Type is created. Registries are
# keyed by the media type (the part before the "/", without a leading "x-") and
# are written to <tt><media type>.yaml</tt> in the destination directory.
class TikaMIMETypes
  # Default source URLs and output directory used by ::download when the
  # corresponding option is not supplied.
  DEFAULTS = {
    urls: ["https://github.com/apache/tika/raw/refs/heads/main/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml"],
    to: Pathname(__FILE__).join("../../types")
  }.freeze.each_value(&:freeze)

  # Downloads each Tika definition file, merges it into the registries found in
  # the destination directory and writes the registries back.
  #
  # +options+ may contain:
  # [:to]   destination directory; falls back to <tt>DEFAULTS[:to]</tt> when
  #         missing or +nil+ (Rake passes +nil+ for an omitted task argument).
  # [:urls] list of URLs to fetch; falls back to <tt>DEFAULTS[:urls]</tt>.
  #
  # Returns the list of URLs that were processed.
  def self.download(options = {})
    dest = Pathname(options[:to] || DEFAULTS[:to]).expand_path
    urls = options.fetch(:urls, DEFAULTS[:urls])

    puts "Downloading Apache Tika MIME type list."

    urls.each do |url|
      puts "\t#{url}"

      document = Nokogiri::XML(URI.parse(url).open(&:read))
      new(dest).parse(document.xpath("/mime-info/mime-type")).save
    end
  end

  # Creates an importer that reads and writes registries in the directory +to+.
  def initialize(to)
    @to = Pathname(to).expand_path
    @registries = {}
  end

  # Merges +records+ into the in-memory registries.
  #
  # +records+ is an enumerable of XML nodes (or anything node-like) that answer
  # <tt>["type"]</tt> with the content type and <tt>css("glob")</tt> with the
  # glob nodes. Returns +self+ so that calls can be chained.
  def parse(records)
    records.each do |record|
      content_type = record["type"]

      # A record without a type attribute cannot be mapped to a registry.
      next if content_type.nil? || content_type.empty?

      # Do not process any records where the subtype includes attributes like format or
      # version. MIME::Types is not built for this specific behaviour.
      next if content_type.include?(";")

      extensions = extensions_for(record)
      registry = registry_for(registry_name_for(content_type))
      existing_types = registry[:types].select { |t| t.content_type.casecmp(content_type).zero? }

      if existing_types.empty?
        MIME::Type.new(content_type) do |mt|
          mt.extensions = extensions
          registry[:types].add_type(mt, true)
        end
      else
        existing_types.each { |mt| mt.add_extensions(extensions) }
      end
    end

    self
  end

  # Writes every registry touched by #parse to its YAML file, creating the
  # destination directory if needed. Types are written sorted
  # case-insensitively by content type, without duplicates.
  def save
    @to.mkpath

    @registries.each_value do |registry|
      sorted = registry[:types]
        .to_a
        .sort { |a, b| a.content_type.casecmp(b.content_type) }
        .uniq

      File.open(registry[:file], "wb") { |f| f.puts sorted.to_yaml }
    end
  end

  private

  # Returns the registry name for +content_type+: the media type before the
  # "/" with a leading "x-" removed (so "x-conference/..." maps to "conference").
  def registry_name_for(content_type)
    content_type.split("/", 2).first.delete_prefix("x-")
  end

  # Returns the unique, lower-cased extensions derived from the glob children of
  # +record+. Globs that yield no usable extension are ignored.
  def extensions_for(record)
    record.css("glob")
      .map { |glob| extension_for(glob) }
      .compact
      .map(&:downcase)
      .reject(&:empty?)
      .uniq
  end

  # Derives a single extension from a +glob+ node, or returns +nil+ when the
  # glob has no pattern or the pattern has no supported shape.
  #
  # * A regular expression (+isregex+ is "true", compared case-insensitively)
  #   only has its leading "^" and trailing "$" removed.
  #   NOTE(review): any other regexp syntax is kept verbatim -- confirm that
  #   this is the intended result for non-trivial expressions.
  # * "*.ext" becomes "ext".
  # * A plain name made of word characters and dashes, optionally preceded by a
  #   dot, is used unchanged.
  def extension_for(glob)
    pattern = glob["pattern"]
    return if pattern.nil? || pattern.empty?

    if glob["isregex"].to_s.casecmp("true").zero?
      pattern.gsub(/\A\^|\$\z/, "")
    elsif pattern.start_with?("*.")
      pattern.delete_prefix("*.")
    elsif pattern.match?(/\A\.?[-\w]+\z/)
      pattern
    end
  end

  # Returns the registry entry (+:file+ and +:types+) for the media type +type+,
  # loading it on first use and caching it for later calls.
  def registry_for(type)
    @registries[type] ||= begin
      file = @to.join("#{type}.yaml")
      {file: file, types: mime_types_for(file)}
    end
  end

  # Builds a MIME::Types container, pre-populated from +file+ when it exists.
  def mime_types_for(file)
    MIME::Types.new.tap do |container|
      if file.exist?
        container.add(*MIME::Types::Loader.load_from_yaml(file), :silent)
      end
    end
  end
end

# Backwards-compatible alias for the original, misspelled constant name, which
# existing callers may still reference.
TikeMIMETypes = TikaMIMETypes

0 commit comments

Comments
 (0)