Skip to content

new annotations ns that prepares us for larger api. #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 204 additions & 0 deletions src/org/clojurenlp/annotations.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
(ns org.clojurenlp.annotations
(:import
(edu.stanford.nlp.dcoref
CorefCoreAnnotations$CorefChainAnnotation)
(edu.stanford.nlp.ling
CoreAnnotations
CoreAnnotations$AfterAnnotation
CoreAnnotations$AuthorAnnotation
CoreAnnotations$BeforeAnnotation
CoreAnnotations$CharacterOffsetBeginAnnotation
CoreAnnotations$CharacterOffsetEndAnnotation
CoreAnnotations$DocDateAnnotation
CoreAnnotations$DocIDAnnotation
CoreAnnotations$DocSourceTypeAnnotation
CoreAnnotations$DocTypeAnnotation
CoreAnnotations$IndexAnnotation
CoreAnnotations$KBPTriplesAnnotation
CoreAnnotations$LineNumberAnnotation
CoreAnnotations$LocationAnnotation
CoreAnnotations$MentionsAnnotation
CoreAnnotations$NamedEntityTagAnnotation
CoreAnnotations$NormalizedNamedEntityTagAnnotation
CoreAnnotations$PartOfSpeechAnnotation
CoreAnnotations$OriginalTextAnnotation
CoreAnnotations$QuotationIndexAnnotation
CoreAnnotations$QuotationsAnnotation
CoreAnnotations$SectionDateAnnotation
CoreAnnotations$SectionsAnnotation
CoreAnnotations$SentencesAnnotation
CoreAnnotations$SentenceBeginAnnotation
CoreAnnotations$SentenceEndAnnotation
CoreAnnotations$SentenceIDAnnotation
CoreAnnotations$SentenceIndexAnnotation
CoreAnnotations$SpeakerAnnotation
CoreAnnotations$TextAnnotation
CoreAnnotations$TokensAnnotation
CoreAnnotations$TokenBeginAnnotation
CoreAnnotations$TokenEndAnnotation
CoreAnnotations$TrueCaseAnnotation
CoreAnnotations$TrueCaseTextAnnotation
CoreAnnotations$WikipediaEntityAnnotation)
(edu.stanford.nlp.naturalli
NaturalLogicAnnotations
NaturalLogicAnnotations$RelationTriplesAnnotation)
(edu.stanford.nlp.neural.rnn
RNNCoreAnnotations)
(edu.stanford.nlp.pipeline
Annotation
QuoteAnnotator
QuoteAttributionAnnotator$SpeakerAnnotation)
(edu.stanford.nlp.semgraph
SemanticGraphCoreAnnotations$BasicDependenciesAnnotation
SemanticGraphCoreAnnotations$EnhancedPlusPlusDependenciesAnnotation
SemanticGraphCoreAnnotations$EnhancedDependenciesAnnotation)
(edu.stanford.nlp.sentiment
SentimentCoreAnnotations$SentimentAnnotatedTree
SentimentCoreAnnotations$SentimentClass)
(edu.stanford.nlp.time TimeAnnotations$TimexAnnotation)
(edu.stanford.nlp.trees
TreeCoreAnnotations$TreeAnnotation)
(edu.stanford.nlp.util CoreMap)
(org.ejml.simple SimpleMatrix)))



;;; Text

(defn get-text
  "Returns the value `ann` holds under the CoreAnnotations$TextAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$TextAnnotation))

(defn get-original-text
  "Returns the value `ann` holds under the
  CoreAnnotations$OriginalTextAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$OriginalTextAnnotation))

(defn get-index
  "Returns the value `ann` holds under the CoreAnnotations$IndexAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$IndexAnnotation))

(defn get-character-offset-begin
  "Returns the value `ann` holds under the
  CoreAnnotations$CharacterOffsetBeginAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$CharacterOffsetBeginAnnotation))

(defn get-character-offset-end
  "Returns the value `ann` holds under the
  CoreAnnotations$CharacterOffsetEndAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$CharacterOffsetEndAnnotation))

;; Part of speech

(defn get-part-of-speech
  "Returns the value `ann` holds under the
  CoreAnnotations$PartOfSpeechAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$PartOfSpeechAnnotation))

;;; Tokens

(defn get-tokens
  "Returns the value `ann` holds under the CoreAnnotations$TokensAnnotation key."
  ;; Hinted as CoreMap rather than Annotation: .get only needs the CoreMap
  ;; interface, and the narrower hint would reject any CoreMap that is not an
  ;; Annotation instance with a ClassCastException.
  [^CoreMap ann]
  (.get ann CoreAnnotations$TokensAnnotation))

(defn get-token-begin
  "Returns the value `ann` holds under the
  CoreAnnotations$TokenBeginAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$TokenBeginAnnotation))

(defn get-token-end
  "Returns the value `ann` holds under the
  CoreAnnotations$TokenEndAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$TokenEndAnnotation))

(defn get-speaker
  "Returns the value `token` holds under the
  CoreAnnotations$SpeakerAnnotation key.

  NOTE: this namespace defines `get-speaker` a second time further down (in
  the Quotes section). Because a later `defn` rebinds the var, that later
  definition is the one callers actually reach once the file has loaded."
  [^CoreMap token]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get token CoreAnnotations$SpeakerAnnotation))

(defn get-true-case
  "Returns the value `token` holds under the
  CoreAnnotations$TrueCaseAnnotation key."
  [^CoreMap token]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get token CoreAnnotations$TrueCaseAnnotation))

(defn get-true-case-text
  "Returns the value `token` holds under the
  CoreAnnotations$TrueCaseTextAnnotation key."
  [^CoreMap token]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get token CoreAnnotations$TrueCaseTextAnnotation))
;
(defn get-before
  "Returns the value `token` holds under the
  CoreAnnotations$BeforeAnnotation key."
  [^CoreMap token]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get token CoreAnnotations$BeforeAnnotation))

(defn get-after
  "Returns the value `token` holds under the
  CoreAnnotations$AfterAnnotation key."
  [^CoreMap token]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get token CoreAnnotations$AfterAnnotation))


;;; Document

(defn get-doc-id
  "Returns the value `doc` holds under the CoreAnnotations$DocIDAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$DocIDAnnotation))
(defn get-doc-date
  "Returns the value `doc` holds under the CoreAnnotations$DocDateAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$DocDateAnnotation))
(defn get-doc-source-type
  "Returns the value `doc` holds under the
  CoreAnnotations$DocSourceTypeAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$DocSourceTypeAnnotation))
(defn get-doc-type
  "Returns the value `doc` holds under the CoreAnnotations$DocTypeAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$DocTypeAnnotation))
(defn get-author
  "Returns the value `doc` holds under the CoreAnnotations$AuthorAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$AuthorAnnotation))
(defn get-location
  "Returns the value `doc` holds under the CoreAnnotations$LocationAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CoreAnnotations$LocationAnnotation))

;;; Sentences

(defn get-sentences
  "Returns the value `ann` holds under the
  CoreAnnotations$SentencesAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$SentencesAnnotation))
#_(defn get-before [sentence] edu.stanford.nlp.ling.CoreAnnotations$BeforeAnnotation)
#_(defn get-after [sentence] CoreAnnotations$AfterAnnotation)

(defn get-sentence-id
  "Returns the value `sentence` holds under the
  CoreAnnotations$SentenceIDAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence CoreAnnotations$SentenceIDAnnotation))
(defn get-sentence-index
  "Returns the value `sentence` holds under the
  CoreAnnotations$SentenceIndexAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence CoreAnnotations$SentenceIndexAnnotation))
(defn get-line-number
  "Returns the value `sentence` holds under the
  CoreAnnotations$LineNumberAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence CoreAnnotations$LineNumberAnnotation))
(defn get-tree
  "Returns the value `sentence` holds under the
  TreeCoreAnnotations$TreeAnnotation key."
  [^CoreMap sentence]
  ;; Note the key class: it lives in TreeCoreAnnotations, not CoreAnnotations.
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence TreeCoreAnnotations$TreeAnnotation))
(defn get-basic-dependencies
  "Returns the value `sentence` holds under the
  SemanticGraphCoreAnnotations$BasicDependenciesAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence SemanticGraphCoreAnnotations$BasicDependenciesAnnotation))
(defn get-enhanced-dependencies
  "Returns the value `sentence` holds under the
  SemanticGraphCoreAnnotations$EnhancedDependenciesAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence SemanticGraphCoreAnnotations$EnhancedDependenciesAnnotation))
(defn get-enhanced-plus-plus-dependencies
  "Returns the value `sentence` holds under the
  SemanticGraphCoreAnnotations$EnhancedPlusPlusDependenciesAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence
        SemanticGraphCoreAnnotations$EnhancedPlusPlusDependenciesAnnotation))

;; Sentiment

(defn get-sentiment-class
  "Returns the value `sentence` holds under the
  SentimentCoreAnnotations$SentimentClass key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence SentimentCoreAnnotations$SentimentClass))

(defn get-sentiment-annotated-tree
  "Returns the value `sentence` holds under the
  SentimentCoreAnnotations$SentimentAnnotatedTree key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence SentimentCoreAnnotations$SentimentAnnotatedTree))

(defn get-sentiment
  "Returns the result of RNNCoreAnnotations/getPredictedClass applied to
  `sentiment-tree`."
  [sentiment-tree]
  (-> sentiment-tree
      RNNCoreAnnotations/getPredictedClass))

#_(defn get-sentiment-predictions [sentiment-tree]
(RNNCoreAnnotations/getPredictionsAsStringList sentiment-tree))

;; OpenIE
(defn get-relation-triples
  "Returns the value `sentence` holds under the
  NaturalLogicAnnotations$RelationTriplesAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence NaturalLogicAnnotations$RelationTriplesAnnotation))

;; KBP
(defn get-kbp-triples
  "Returns the value `sentence` holds under the
  CoreAnnotations$KBPTriplesAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence CoreAnnotations$KBPTriplesAnnotation))

;; Entity mentions
(defn get-mentions
  "Returns the value `sentence` holds under the
  CoreAnnotations$MentionsAnnotation key."
  [^CoreMap sentence]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get sentence CoreAnnotations$MentionsAnnotation))
(defn get-named-entity-tag
  "Returns the value `mention` holds under the
  CoreAnnotations$NamedEntityTagAnnotation key."
  [^CoreMap mention]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get mention CoreAnnotations$NamedEntityTagAnnotation))
(defn get-normalized-named-entity-tag
  "Returns the value `mention` holds under the
  CoreAnnotations$NormalizedNamedEntityTagAnnotation key."
  [^CoreMap mention]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get mention CoreAnnotations$NormalizedNamedEntityTagAnnotation))
(defn get-wikipedia-entity
  "Returns the value `mention` holds under the
  CoreAnnotations$WikipediaEntityAnnotation key."
  [^CoreMap mention]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get mention CoreAnnotations$WikipediaEntityAnnotation))
(defn get-time
  "Returns the value `ann` holds under the TimeAnnotations$TimexAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann TimeAnnotations$TimexAnnotation))

;;; Quotes

(defn get-quotations
  "Returns the value `ann` holds under the
  CoreAnnotations$QuotationsAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$QuotationsAnnotation))

(defn gather-quotes
  "Returns the result of QuoteAnnotator/gatherQuotes applied to `ann`."
  [ann]
  (-> ann
      QuoteAnnotator/gatherQuotes))

(defn get-quotation-index
  "Returns the value `quote` holds under the
  CoreAnnotations$QuotationIndexAnnotation key."
  [^CoreMap quote]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get quote CoreAnnotations$QuotationIndexAnnotation))

(defn get-sentence-begin
  "Returns the value `quote` holds under the
  CoreAnnotations$SentenceBeginAnnotation key."
  [^CoreMap quote]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get quote CoreAnnotations$SentenceBeginAnnotation))

(defn get-sentence-end
  "Returns the value `quote` holds under the
  CoreAnnotations$SentenceEndAnnotation key."
  [^CoreMap quote]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get quote CoreAnnotations$SentenceEndAnnotation))

(defn get-speaker
  "Returns the speaker recorded on `quote`.

  Looks up the QuoteAttributionAnnotator$SpeakerAnnotation key first and, when
  that lookup yields nil, falls back to the CoreAnnotations$SpeakerAnnotation
  key.

  This namespace defines `get-speaker` twice. This later definition is the one
  the var stays bound to after loading, so without the fallback the earlier
  CoreAnnotations$SpeakerAnnotation lookup would be unreachable."
  [^CoreMap quote]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (or (.get quote QuoteAttributionAnnotator$SpeakerAnnotation)
      (.get quote CoreAnnotations$SpeakerAnnotation)))

;; Sections
(defn get-sections
  "Returns the value `ann` holds under the
  CoreAnnotations$SectionsAnnotation key."
  [^CoreMap ann]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get ann CoreAnnotations$SectionsAnnotation))
(defn get-section-date
  "Returns the value `section` holds under the
  CoreAnnotations$SectionDateAnnotation key."
  [^CoreMap section]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get section CoreAnnotations$SectionDateAnnotation))

;; Coreference chains
(defn get-coref-chain
  "Returns the value `doc` holds under the
  CorefCoreAnnotations$CorefChainAnnotation key."
  [^CoreMap doc]
  ;; The ^CoreMap hint lets the compiler resolve .get without reflection.
  (.get doc CorefCoreAnnotations$CorefChainAnnotation))

52 changes: 24 additions & 28 deletions src/org/clojurenlp/core.clj
Original file line number Diff line number Diff line change
@@ -1,30 +1,27 @@
(ns org.clojurenlp.core
(:require
[org.clojurenlp.annotations :as ann]
[clojure.data.json :as json]
[clojure.set :as set]
[loom.attr :as attr]
[loom.graph :as graph])
(:import (java.io StringReader)
(java.util ArrayList
Collection
Map
Properties)
(edu.stanford.nlp.process DocumentPreprocessor
PTBTokenizer)
(edu.stanford.nlp.ling CoreLabel TaggedWord Word)
(edu.stanford.nlp.tagger.maxent MaxentTagger)
(edu.stanford.nlp.trees LabeledScoredTreeNode
LabeledScoredTreeReaderFactory
PennTreebankLanguagePack
TypedDependency)
(edu.stanford.nlp.parser.common ParserGrammar)
(edu.stanford.nlp.parser.lexparser LexicalizedParser)
(edu.stanford.nlp.pipeline Annotation StanfordCoreNLP)
(edu.stanford.nlp.ling CoreAnnotations$SentencesAnnotation
CoreAnnotations$TextAnnotation
CoreAnnotations$NamedEntityTagAnnotation
CoreAnnotations$TokensAnnotation
Word))
(:import
(java.io StringReader)
(java.util ArrayList
Collection
Map
Properties)
(edu.stanford.nlp.process DocumentPreprocessor
PTBTokenizer)
(edu.stanford.nlp.ling CoreLabel TaggedWord Word)
(edu.stanford.nlp.tagger.maxent MaxentTagger)
(edu.stanford.nlp.trees LabeledScoredTreeNode
LabeledScoredTreeReaderFactory
PennTreebankLanguagePack
TypedDependency)
(edu.stanford.nlp.parser.common ParserGrammar)
(edu.stanford.nlp.parser.lexparser LexicalizedParser)
(edu.stanford.nlp.pipeline Annotation StanfordCoreNLP))
(:gen-class :main true))

(defn pprint-methods!
Expand Down Expand Up @@ -52,12 +49,11 @@
(defn tokenize
  "Tokenizes `text` with tokenize-corelabels and returns one map per result,
  with the keys :token, :start-offset and :end-offset."
  [text]
  ;; The diff listing carried both the removed and the added :token entry;
  ;; only the added one (the ann/get-text accessor) is kept.
  (map (fn [core-label]
         {:token        (ann/get-text core-label)
          :start-offset (.beginPosition core-label)
          :end-offset   (.endPosition core-label)})
       (tokenize-corelabels text)))


(defn split-sentences [text]
"Split a string into a sequence of sentences, each of which is a sequence of CoreLabels"
(let [rdr (StringReader. text)]
Expand All @@ -72,7 +68,7 @@
(last (map #(.endPosition %) core-labels)))

(defn sentence-text
  "Returns a lazy sequence of ann/get-text applied to each item of
  `core-labels`."
  [core-labels]
  ;; The diff listing carried both the removed and the added body, leaving
  ;; unbalanced parens; only the added body is kept.
  (map ann/get-text core-labels))


(defn sentenize [text]
Expand Down Expand Up @@ -150,16 +146,16 @@
(defn- get-tokens-entities
  "Builds a map for one token annotation:
  {:token token :named-entity named-entity
   :start-offset begin-position :end-offset end-position}"
  [tok-ann]
  ;; The diff listing carried the removed and the added :token / :named-entity
  ;; entries side by side, which is a duplicate-key map literal; only the
  ;; added entries (the ann/ accessors) are kept.
  {:token        (ann/get-text tok-ann)
   :named-entity (ann/get-named-entity-tag tok-ann)
   :start-offset (.beginPosition tok-ann)
   :end-offset   (.endPosition tok-ann)})

(defn- get-token-annotations
  "Maps get-tokens-entities over the tokens that ann/get-tokens returns for
  `sentence-annotation`, yielding one map per token."
  [sentence-annotation]
  ;; The diff listing carried both the removed and the added body, leaving
  ;; unbalanced parens; only the added body is kept.
  (map get-tokens-entities (ann/get-tokens sentence-annotation)))

(defn- get-text-tokens [sen-ann]
"builds map: {:tokens tokens}"
Expand All @@ -170,7 +166,7 @@
get-text-tokens which returns a map:
{:tokens {:token token :named-entity ne}}"
[^Annotation annotation]
(map get-text-tokens (.get annotation CoreAnnotations$SentencesAnnotation)))
(map get-text-tokens (ann/get-sentences annotation)))

(defn tag-ner
"Returns a map object containing original text, tokens, sentences"
Expand Down