Skip to content

Commit 8d83b31

Browse files
committed
add geo info
1 parent 4c73b83 commit 8d83b31

File tree

2 files changed

+247
-6
lines changed

2 files changed

+247
-6
lines changed

.github/workflows/entities.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ jobs:
1717
runs-on: ubuntu-latest
1818
container:
1919
image: credocker/crepython:2020.0
20+
env:
21+
GEONAMES_KEY: ${{ secrets.GEONAMES_KEY }}
2022
volumes:
2123
- ${{ github.workspace }}:/cre/python
2224
steps:
@@ -29,8 +31,12 @@ jobs:
2931
# ref: main
3032
- name: cd /cre/python/
3133
run: (cd /cre/python/)
34+
- name: cp mysecrets.py
35+
run: (cp mysecrets.orig.py mysecrets.py)
3236
- name: Install spacy textblob
3337
run: (pip3 install spacy==2.2.4 textblob==0.15.3 nltk==3.8)
38+
- name: Install geocoder
39+
run: (pip3 install geocoder geopandas==0.9)
3440
- name: Install download en_core_web_md
3541
run: (python3 -m spacy download en_core_web_md)
3642
- name: Run entities

entities.py

Lines changed: 241 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import mysecrets
12
import pandas as pd
23

34
from pathlib import Path
@@ -12,6 +13,10 @@
1213
# pip3 install spacy
1314
# python3 -m spacy download en_core_web_md
1415
#pip3 install textblob
16+
import requests
17+
import json
18+
import geocoder
19+
import geopandas
1520

1621
import nltk
1722
import spacy
@@ -27,6 +32,14 @@
2732
if(not os.path.exists(DATA_PATH / 'csv')):
2833
os.mkdir(DATA_PATH / 'csv')
2934

35+
# Reference data used for point-in-polygon lookups; all three files are
# downloaded from the creAssets repository when this module is loaded.

# IPCC WGI reference regions (polygons).
ipccRegions = geopandas.read_file(
    'https://github.com/creDocker/creAssets/blob/main/cre/versions/u24.04/assets/public/ipcc/IPCC-WGI-reference-regions-v4.geojson?raw=true'
)

# Country attribute table and country shapes, both carrying a geonames id.
countriesInfo = pd.read_csv(
    "https://github.com/creDocker/creAssets/blob/main/cre/versions/u24.04/assets/public/geonames/countryInfo.csv?raw=true"
)
countriesGeo = geopandas.read_file(
    'https://raw.githubusercontent.com/creDocker/creAssets/refs/heads/main/cre/versions/u24.04/assets/public/geonames/shapes_countries.json'
)

# Cast both id columns to int so the join keys have the same dtype.
countriesInfo['geonameid'] = countriesInfo['geonameid'].astype(int)
countriesGeo['geoNameId'] = countriesGeo['geoNameId'].astype(int)

# Country shapes joined with their attributes (inner join on the geonames id).
countriesDf = pd.merge(
    countriesGeo,
    countriesInfo,
    left_on='geoNameId',
    right_on='geonameid',
)
42+
3043
def getNewsFiles():
3144
fileName = './csv/news_????_??.csv'
3245
files = glob.glob(fileName)
@@ -138,6 +151,222 @@ def groupSentiments(df, aggColumn):
138151
indexMisc = {}
139152
indexMissing = {}
140153

154+
# The geonames.org key is read from the environment.  A missing/empty key, or
# one of the known placeholder values, disables the geonames enrichment.
geonamesKey = os.getenv('GEONAMES_KEY')
foundGeonames = bool(geonamesKey)
if geonamesKey in ('1a2b3c4d5', 'demo_demo_123'):
    print('Please set geonames.org key in file: secrets.py')
    foundGeonames = False
print(['foundGeonames', foundGeonames])

# Remaining number of geonames lookups; decremented by enrichFromGeonames().
geomax = 250
169+
def enrichFromGeonames(df):
    """Fill the geographic columns of rows whose ``geonames`` value is ``-1``.

    Each unresolved phrase is looked up at geonames.org through ``geocoder``.
    On a hit the row receives the geonames id, coordinates, feature class,
    country, IPCC region / continent and, when one can be found, a GND
    identifier.  On a miss ``geonames`` is set to ``0`` so the row no longer
    carries the ``-1`` "unresolved" marker.

    The module-level counter ``geomax`` limits the number of lookups; it is
    decremented once per lookup and is not reset here.

    :param df: DataFrame providing the columns ``phrase``, ``language`` and
        ``geonames``; it is updated in place.
    :return: the same DataFrame object.
    """
    global geomax
    print('Starting with geonames')
    if not foundGeonames:
        print('geonames not found')
        return df

    for index, row in df.iterrows():
        if geomax <= 0:
            # Budget exhausted: every remaining row would be skipped anyway.
            break
        if str(row.geonames) != '-1':
            continue

        lang = str(row.language)
        phrase = str(row.phrase)
        print('things to do')
        gn = geocoder.geonames(phrase, lang=lang, key=geonamesKey)
        print([phrase, gn, gn.geonames_id])

        if gn.geonames_id:
            df.loc[index, 'geonames'] = int(gn.geonames_id)
            df.loc[index, 'latitude'] = float(gn.lat)
            df.loc[index, 'longitude'] = float(gn.lng)
            df.loc[index, 'geotype'] = gn.feature_class
            # gn.country is localized, so the country name is taken from a
            # second, English lookup.
            gne = geocoder.geonames(phrase, lang='en', key=geonamesKey)
            if gne.country:
                df.loc[index, 'country'] = gne.country
            print(['geo', gn.lat, gn.lng, gn])

            _assignRegionsFromPoint(df, index, phrase, gn)

            gndId = _findGndId(phrase, gn)
            if gndId is not None:
                df.loc[index, 'gnd'] = gndId
        else:
            print(['geonames found nothing', phrase, gn, gn.geonames_id])
            df.loc[index, 'geonames'] = 0

        geomax -= 1
        # Short pause between lookups to go easy on the remote services.
        time.sleep(0.1)
    return df


def _assignRegionsFromPoint(df, index, phrase, gn):
    """Set ``ipcc``, ``continent`` and ``country`` from the polygons containing *gn*."""
    coordinates = geopandas.points_from_xy([float(gn.lng)], [float(gn.lat)])
    print(['points_from_xy', coordinates])
    # Plain authority string instead of the deprecated {'init': ...} mapping.
    Coords = geopandas.GeoDataFrame(
        {'geometry': coordinates, 'name': [phrase]},
        crs='EPSG:4326',
    )
    print(['GeoDataFrame', Coords])

    # NOTE: ``op=`` is kept because the workflow pins geopandas 0.9; newer
    # releases call this argument ``predicate``.
    whichIpcc = geopandas.sjoin(ipccRegions, Coords, how='inner', op='intersects')
    print(whichIpcc)
    if not whichIpcc.empty:
        df.loc[index, 'ipcc'] = list(whichIpcc['Acronym'])[0]
        df.loc[index, 'continent'] = list(whichIpcc['Continent'])[0]

    whichCountry = geopandas.sjoin(countriesDf, Coords, how='inner', op='intersects')
    print(whichCountry)
    if not whichCountry.empty:
        # The polygon hit overrides the country name reported by geonames.
        df.loc[index, 'country'] = list(whichCountry['Country'])[0]


def _findGndId(phrase, gn):
    """Return the GND id for *gn* as a string, trying id, name+position, then name."""
    lookups = (
        lambda: searchGndByGeonamesId(gn.geonames_id),
        lambda: searchGndByNameAndGeo(phrase, float(gn.lat), float(gn.lng)),
        lambda: searchGndByName(phrase),
    )
    # Lookups are evaluated lazily so later services are only queried when
    # the earlier ones found nothing.
    for lookup in lookups:
        gnd = lookup()
        if gnd and 'gndId' in gnd:
            return str(gnd['gndId'])
    return None
236+
237+
def searchGndByGeonamesId(geonamesId):
    """Look up the GND record that is linked to a geonames id.

    Queries lobid.org for places matching the id and returns the first member
    whose ``sameAs`` list contains the matching ``sws.geonames.org`` URI and
    which has a ``gndIdentifier``.

    :param geonamesId: geonames id (converted with ``str``).
    :return: dict with ``gndId`` and, when present in the record,
        ``longitude``/``latitude``, ``variantNames`` and ``preferredName``;
        ``None`` when nothing matches or the request does not return 200.
    """
    idText = str(geonamesId)
    queryUrl = 'https://lobid.org/gnd/search?q=' + idText + '&filter=type%3APlaceOrGeographicName&format=json'
    response = requests.get(queryUrl, timeout=60)
    if response.status_code != 200:
        return None
    body = response.content
    if not body:
        return None
    payload = json.loads(body)
    if 'member' not in payload:
        return None

    wantedUri = "https://sws.geonames.org/" + idText
    for member in payload['member']:
        if 'sameAs' not in member:
            continue
        for same in member['sameAs']:
            if 'id' not in same:
                continue
            if same['id'] != wantedUri:
                continue
            if 'gndIdentifier' not in member:
                continue

            record = {'gndId': member['gndIdentifier']}
            if 'hasGeometry' in member:
                for geometry in member['hasGeometry']:
                    isPoint = 'asWKT' in geometry and 'type' in geometry and geometry['type'] == 'Point'
                    if not isPoint:
                        continue
                    # WKT looks like "Point ( <lon> <lat> )"; keep the numbers only.
                    wkt = geometry['asWKT'][0]
                    wkt = wkt.replace('Point ', '').strip().strip('()').strip()
                    parts = wkt.split(" ")
                    record['longitude'] = float(parts[0])
                    record['latitude'] = float(parts[1])
            if 'variantName' in member:
                record['variantNames'] = member['variantName']
            if 'preferredName' in member:
                record['preferredName'] = member['preferredName']
            return record
    return None
281+
282+
def searchGndByNameAndGeo(locationName, latitude, longitude, maxDistance=10):
    """Return the GND place matching *locationName* that is closest to a position.

    Queries lobid.org for places matching the name and picks, among the
    members that have a ``gndIdentifier`` and a point geometry, the one
    nearest to (*latitude*, *longitude*).

    :param locationName: place name to search for.
    :param latitude: reference latitude.
    :param longitude: reference longitude.
    :param maxDistance: exclusive upper bound on the accepted distance.  The
        distance is the plain Euclidean distance of the longitude/latitude
        values, i.e. it is measured in the units of the coordinates.
    :return: dict with ``longitude``, ``latitude``, ``distance``, ``gndId``
        and (if present) ``preferredName``; ``None`` when no candidate lies
        within *maxDistance* or the request does not return 200.
    """
    # Passing the name via ``params`` lets requests escape characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    page = requests.get(
        'https://lobid.org/gnd/search',
        params={
            'q': locationName,
            'filter': 'type:PlaceOrGeographicName',
            'format': 'json',
        },
        timeout=60,
    )
    if page.status_code != 200 or not page.content:
        return None
    jsonData = json.loads(page.content)
    if 'member' not in jsonData:
        return None

    result = None
    # Starting at the threshold means only candidates strictly closer than
    # maxDistance can ever become the result.
    minDistance2 = maxDistance ** 2
    for member in jsonData['member']:
        if 'gndIdentifier' not in member:
            # Without an identifier the member cannot be returned, so it must
            # not influence the best distance either.
            continue
        for geo in member.get('hasGeometry', []):
            point = _parseGndPoint(geo)
            if point is None:
                continue
            currLongitude, currLatitude = point
            distance2 = (currLongitude - longitude) ** 2 + (currLatitude - latitude) ** 2
            if distance2 < minDistance2:
                minDistance2 = distance2
                result = {
                    'longitude': currLongitude,
                    'latitude': currLatitude,
                    'distance': distance2 ** 0.5,
                    'gndId': member['gndIdentifier'],
                }
                if 'preferredName' in member:
                    result['preferredName'] = member['preferredName']
    return result


def _parseGndPoint(geo):
    """Return ``(longitude, latitude)`` of a lobid point geometry, else ``None``."""
    if not ('asWKT' in geo and 'type' in geo and geo['type'] == 'Point'):
        return None
    try:
        # WKT looks like "Point ( <lon> <lat> )"; keep the two numbers only.
        point = geo['asWKT'][0]
        coords = point.replace('Point ', '').strip().strip('()').strip().split(" ")
        return float(coords[0]), float(coords[1])
    except (IndexError, ValueError):
        # Malformed geometry: skip it instead of aborting the whole search.
        return None
325+
326+
def searchGndByName(locationName):
    """Return the first GND place whose name equals *locationName*.

    Queries lobid.org for places matching the name and returns the first
    member with a ``gndIdentifier`` whose ``preferredName`` equals
    *locationName* or whose ``variantName`` entries contain it.

    :param locationName: place name to search for.
    :return: dict with ``gndId`` and, when present in the record,
        ``longitude``/``latitude``, ``variantNames`` and ``preferredName``;
        ``None`` when nothing matches or the request does not return 200.
    """
    # Passing the name via ``params`` lets requests escape characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    page = requests.get(
        'https://lobid.org/gnd/search',
        params={
            'q': locationName,
            'filter': 'type:PlaceOrGeographicName',
            'format': 'json',
        },
        timeout=60,
    )
    if page.status_code != 200 or not page.content:
        return None
    jsonData = json.loads(page.content)
    if 'member' not in jsonData:
        return None

    for member in jsonData['member']:
        if 'gndIdentifier' not in member:
            continue
        result = {'gndId': member['gndIdentifier']}
        for geo in member.get('hasGeometry', []):
            if 'asWKT' in geo and 'type' in geo and geo['type'] == 'Point':
                # WKT looks like "Point ( <lon> <lat> )"; keep the numbers only.
                point = geo['asWKT'][0]
                point = point.replace('Point ', '').strip().strip('()').strip()
                coords = point.split(" ")
                result['longitude'] = float(coords[0])
                result['latitude'] = float(coords[1])

        found = False
        if 'variantName' in member:
            result['variantNames'] = member['variantName']
            found = locationName in member['variantName']
        if 'preferredName' in member:
            result['preferredName'] = member['preferredName']
            found = found or (member['preferredName'] == locationName)
        if found:
            return result
    return None
369+
141370
def strangeCharacters(testString, testCharacters):
142371
count = 0
143372
for oneCharacter in testCharacters:
@@ -161,14 +390,14 @@ def strangeCharacters(testString, testCharacters):
161390

162391
if(entity.label_ in ['LOC','GPE']):
163392
if(entity.text in indexLocations):
164-
indexLocations[entity.text]['count'] += 1
393+
indexLocations[entity.text]['count'] += 1 #TODO add valid value...
165394
indexLocations[entity.text]['sentiment'] += sentence.sentiment.polarity
166395
indexLocations[entity.text]['subjectivity'] += sentence.sentiment.subjectivity
167396
else:
168397
indexLocations[entity.text] = {'phrase':entity.text, 'label':entity.label_, 'sentiment':sentence.sentiment.polarity,
169398
'subjectivity':sentence.sentiment.subjectivity, 'language':lang, 'count':1,
170-
'geonames':-1, 'geotype':None, 'latitude':None, 'longitude':None,
171-
'country':None, 'ipcc':None}
399+
'gnd':None, 'geonames':-1, 'geotype':None, 'latitude':None, 'longitude':None,
400+
'continent':None, 'country':None, 'ipcc':None}
172401
if ('geonames' in oldLocationsDf.columns):
173402
foundInOlDf = oldLocationsDf[oldLocationsDf['phrase']==entity.text]
174403
foundInOlDf = foundInOlDf[foundInOlDf['geonames']>-0.5]
@@ -180,6 +409,10 @@ def strangeCharacters(testString, testCharacters):
180409
indexLocations[entity.text]['longitude'] = float(foundInOlDf['longitude'].mean())
181410
indexLocations[entity.text]['country'] = foundInOlDf['country'].min()
182411
indexLocations[entity.text]['ipcc'] = foundInOlDf['ipcc'].min()
412+
if('continent' in foundInOlDf.columns):
413+
indexLocations[entity.text]['continent'] = foundInOlDf['continent'].min()
414+
if('gnd' in foundInOlDf.columns):
415+
indexLocations[entity.text]['gnd'] = foundInOlDf['gnd'].min()
183416

184417

185418
elif(entity.label_ in ['PER','PERSON']):
@@ -219,13 +452,15 @@ def strangeCharacters(testString, testCharacters):
219452
indexMissing[entity.text] = {'phrase':entity.text, 'label':entity.label_, 'sentiment':sentence.sentiment.polarity,
220453
'subjectivity':sentence.sentiment.subjectivity, 'language':lang, 'count':1}
221454

222-
colSent = ['phrase', 'label', 'sentiment', 'subjectivity', 'language', 'count', 'geonames', 'geotype', 'latitude', 'longitude', 'country', 'ipcc']
223-
indexLocationsDF = pd.DataFrame.from_dict(indexLocations, orient='index', columns=colSent)
455+
colGeo = ['phrase', 'label', 'sentiment', 'subjectivity', 'language', 'count',
456+
'gnd', 'geonames', 'geotype', 'latitude', 'longitude', 'continent', 'country', 'ipcc']
457+
indexLocationsDF = pd.DataFrame.from_dict(indexLocations, orient='index', columns=colGeo)
224458
indexLocationsDF['sentiment'] = indexLocationsDF['sentiment']/indexLocationsDF['count']
225459
indexLocationsDF['subjectivity'] = indexLocationsDF['subjectivity']/indexLocationsDF['count']
226460
indexLocationsDF = indexLocationsDF.sort_values(by=['count'], ascending=False)
227-
indexLocationsDF.to_csv(DATA_PATH / 'csv' / "sentiments_locations.csv", index=True)
461+
indexLocationsDF.to_csv(DATA_PATH / 'csv' / "sentiments_locations.csv", index=True, float_format='%.8f')
228462

463+
colSent = ['phrase', 'label', 'sentiment', 'subjectivity', 'language', 'count']
229464
indexPersonsDF = pd.DataFrame.from_dict(indexPersons, orient='index', columns=colSent)
230465
indexPersonsDF['sentiment'] = indexPersonsDF['sentiment']/indexPersonsDF['count']
231466
indexPersonsDF['subjectivity'] = indexPersonsDF['subjectivity']/indexPersonsDF['count']

0 commit comments

Comments
 (0)