1+ import mysecrets
12import pandas as pd
23
34from pathlib import Path
1213# pip3 install spacy
1314# python3 -m spacy download en_core_web_md
1415#pip3 install textblob
16+ import requests
17+ import json
18+ import geocoder
19+ import geopandas
1520
1621import nltk
1722import spacy
# Make sure the CSV output directory exists before anything is written to it.
# exist_ok=True replaces the separate exists() test, which could race with
# another process creating the directory between the check and the mkdir.
# NOTE(review): assumes DATA_PATH is a pathlib.Path (it is already combined
# with '/' here) - confirm against its definition.
(DATA_PATH / 'csv').mkdir(exist_ok=True)
2934
# Reference geodata downloaded at import time: the IPCC reference regions and
# the geonames country shapes / country table.
ipccRegionsUrl = 'https://github.com/creDocker/creAssets/blob/main/cre/versions/u24.04/assets/public/ipcc/IPCC-WGI-reference-regions-v4.geojson?raw=true'
countriesInfoUrl = "https://github.com/creDocker/creAssets/blob/main/cre/versions/u24.04/assets/public/geonames/countryInfo.csv?raw=true"
countriesGeoUrl = 'https://raw.githubusercontent.com/creDocker/creAssets/refs/heads/main/cre/versions/u24.04/assets/public/geonames/shapes_countries.json'

ipccRegions = geopandas.read_file(ipccRegionsUrl)

# Country shapes, keyed by geonames id.
countriesGeo = geopandas.read_file(countriesGeoUrl)
countriesGeo['geoNameId'] = countriesGeo['geoNameId'].astype(int)

# Country attributes, keyed by geonames id as well.
countriesInfo = pd.read_csv(countriesInfoUrl)
countriesInfo['geonameid'] = countriesInfo['geonameid'].astype(int)

# Both id columns are cast to int above so the join keys compare equal.
countriesDf = pd.merge(
    countriesGeo,
    countriesInfo,
    left_on='geoNameId',
    right_on='geonameid',
)
42+
3043def getNewsFiles ():
3144 fileName = './csv/news_????_??.csv'
3245 files = glob .glob (fileName )
@@ -138,6 +151,222 @@ def groupSentiments(df, aggColumn):
138151indexMisc = {}
139152indexMissing = {}
140153
# Read the geonames.org API key from the environment. Lookups are enabled only
# when a key is present and it is not one of the two known placeholder values.
geonamesKey = os.getenv('GEONAMES_KEY')
foundGeonames = bool(geonamesKey)
if geonamesKey in ('1a2b3c4d5', 'demo_demo_123'):
    print('Please set geonames.org key in file: secrets.py')
    foundGeonames = False
print(['foundGeonames', foundGeonames])
167+
# Remaining number of geonames lookups this process may still perform;
# enrichFromGeonames decrements it once for every row it looks up.
geomax = 250


def _findGndId(geonamesId, phrase, latitude, longitude):
    """Return the GND id of a place as a string, or None if no lookup matches.

    The lookups are tried in order of decreasing precision: by geonames id,
    by name combined with the coordinates, and finally by name alone.
    """
    lookups = (
        lambda: searchGndByGeonamesId(geonamesId),
        lambda: searchGndByNameAndGeo(phrase, latitude, longitude),
        lambda: searchGndByName(phrase),
    )
    for lookup in lookups:
        gnd = lookup()
        if gnd and 'gndId' in gnd:
            return str(gnd['gndId'])
    return None


def enrichFromGeonames(df):
    """Fill in geodata for the rows of ``df`` whose ``geonames`` value is -1.

    For each such row the ``phrase`` is geocoded via geonames (in the row's
    ``language``). On a hit the columns ``geonames``, ``latitude``,
    ``longitude``, ``geotype``, ``country``, ``ipcc``, ``continent`` and
    ``gnd`` are written as far as the respective lookup succeeds; on a miss
    ``geonames`` is set to 0 so the row is not retried. At most ``geomax``
    rows are looked up; the counter is module-global and shared by all calls.

    Args:
        df: DataFrame with at least the columns ``phrase``, ``language`` and
            ``geonames``. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    global geomax
    print('Starting with geonames')
    if not foundGeonames:
        print('geonames not found')
        return df
    for index, row in df.iterrows():
        if geomax <= 0:
            # Lookup budget is spent; the remaining rows stay untouched.
            break
        if str(row.geonames) != '-1':
            continue
        lang = str(row.language)
        phrase = str(row.phrase)
        print('things to do')
        gn = geocoder.geonames(phrase, lang=lang, key=geonamesKey)
        print([phrase, gn, gn.geonames_id])
        if gn.geonames_id:
            df.loc[index, 'geonames'] = int(gn.geonames_id)
            latitude = float(gn.lat)
            longitude = float(gn.lng)
            df.loc[index, 'latitude'] = latitude
            df.loc[index, 'longitude'] = longitude
            df.loc[index, 'geotype'] = gn.feature_class
            # gn.country is localized to `lang`, so ask again in English.
            gne = geocoder.geonames(phrase, lang='en', key=geonamesKey)
            if gne.country:
                df.loc[index, 'country'] = gne.country
            print(['geo', gn.lat, gn.lng, gn])

            # Locate the point in the IPCC regions and the country shapes.
            coordinates = geopandas.points_from_xy([longitude], [latitude])
            print(['points_from_xy', coordinates])
            Coords = geopandas.GeoDataFrame(
                {'geometry': coordinates, 'name': [phrase]},
                crs='EPSG:4326',
            )
            print(['GeoDataFrame', Coords])
            # `predicate` is the current name of sjoin's former `op` keyword.
            whichIpcc = geopandas.sjoin(ipccRegions, Coords, how='inner', predicate='intersects')
            print(whichIpcc)
            if not whichIpcc.empty:
                df.loc[index, 'ipcc'] = whichIpcc['Acronym'].iloc[0]
                df.loc[index, 'continent'] = whichIpcc['Continent'].iloc[0]
            whichCountry = geopandas.sjoin(countriesDf, Coords, how='inner', predicate='intersects')
            print(whichCountry)
            if not whichCountry.empty:
                # The shape-based country overrides the geocoder's answer.
                df.loc[index, 'country'] = whichCountry['Country'].iloc[0]

            gndId = _findGndId(gn.geonames_id, phrase, latitude, longitude)
            if gndId is not None:
                df.loc[index, 'gnd'] = gndId
        else:
            print(['geonames found nothing', phrase, gn, gn.geonames_id])
            df.loc[index, 'geonames'] = 0

        geomax -= 1
        # Short pause between lookups so the remote services are not hammered.
        time.sleep(0.1)
    return df
236+
def searchGndByGeonamesId(geonamesId):
    """Find the GND place record that lobid.org links to a geonames id.

    Queries the lobid.org GND search for ``geonamesId`` and returns the first
    member that has a ``gndIdentifier`` and whose ``sameAs`` list contains
    ``https://sws.geonames.org/<geonamesId>``.

    Args:
        geonamesId: geonames identifier (anything ``str()`` can render).

    Returns:
        A dict with ``gndId`` and, when present in the record, ``longitude``,
        ``latitude``, ``variantNames`` and ``preferredName``; ``None`` when
        the request does not return status 200, the body is empty or not
        valid JSON, or no member matches.

    Raises:
        requests.RequestException: if the HTTP request itself fails.
    """
    geonamesUri = "https://sws.geonames.org/" + str(geonamesId)
    gndurl = 'https://lobid.org/gnd/search?q=' + str(geonamesId) + '&filter=type%3APlaceOrGeographicName&format=json'
    page = requests.get(gndurl, timeout=60)
    if page.status_code != 200 or not page.content:
        return None
    try:
        jsonData = json.loads(page.content)
    except ValueError:
        # A 200 response without a JSON body is treated like "nothing found".
        return None
    if 'member' not in jsonData:
        return None

    for member in jsonData['member']:
        if 'gndIdentifier' not in member:
            continue
        linked = any(
            isinstance(same, dict) and same.get('id') == geonamesUri
            for same in member.get('sameAs', [])
        )
        if not linked:
            continue

        result = {'gndId': member['gndIdentifier']}
        for geo in member.get('hasGeometry', []):
            if 'asWKT' in geo and geo.get('type') == 'Point':
                # WKT text has the form "Point ( <lon> <lat> )"; split() copes
                # with any amount of whitespace between the two numbers.
                coords = geo['asWKT'][0].replace('Point ', '').strip().strip('()').split()
                result['longitude'] = float(coords[0])
                result['latitude'] = float(coords[1])
        if 'variantName' in member:
            result['variantNames'] = member['variantName']
        if 'preferredName' in member:
            result['preferredName'] = member['preferredName']
        return result
    return None
281+
def searchGndByNameAndGeo(locationName, latitude, longitude, maxDistance=10):
    """Find the GND place record named like ``locationName`` nearest to a point.

    Queries the lobid.org GND search for ``locationName`` and, among the
    members that carry a ``gndIdentifier`` and a point geometry, picks the one
    closest to (``latitude``, ``longitude``).

    Args:
        locationName: place name to search for.
        latitude: reference latitude.
        longitude: reference longitude.
        maxDistance: upper bound (exclusive) for the accepted distance. The
            distance is the plain Euclidean distance between the coordinate
            pairs, i.e. it is measured in coordinate units, not kilometres.

    Returns:
        A dict with ``gndId``, ``longitude``, ``latitude``, ``distance`` and,
        when present, ``preferredName`` for the nearest member; ``None`` when
        the request does not return status 200, the body is empty or not
        valid JSON, or no member lies within ``maxDistance``.

    Raises:
        requests.RequestException: if the HTTP request itself fails.
    """
    # Passing the query via `params` lets requests escape the place name, so
    # names containing '&', '#', '+' etc. cannot corrupt the query string.
    page = requests.get(
        'https://lobid.org/gnd/search',
        params={
            'q': locationName,
            'filter': 'type:PlaceOrGeographicName',
            'format': 'json',
        },
        timeout=60,
    )
    if page.status_code != 200 or not page.content:
        return None
    try:
        jsonData = json.loads(page.content)
    except ValueError:
        # A 200 response without a JSON body is treated like "nothing found".
        return None
    if 'member' not in jsonData:
        return None

    # Start at the acceptance threshold: only candidates strictly closer than
    # maxDistance can ever replace it, and each hit tightens the bound.
    minDistance2 = maxDistance ** 2
    result = None
    for member in jsonData['member']:
        if 'gndIdentifier' not in member:
            continue
        for geo in member.get('hasGeometry', []):
            if not ('asWKT' in geo and geo.get('type') == 'Point'):
                continue
            # WKT text has the form "Point ( <lon> <lat> )"; split() copes
            # with any amount of whitespace between the two numbers.
            coords = geo['asWKT'][0].replace('Point ', '').strip().strip('()').split()
            currLongitude = float(coords[0])
            currLatitude = float(coords[1])
            distance2 = (currLongitude - longitude) ** 2 + (currLatitude - latitude) ** 2
            if distance2 < minDistance2:
                minDistance2 = distance2
                result = {
                    'longitude': currLongitude,
                    'latitude': currLatitude,
                    'distance': distance2 ** 0.5,
                    'gndId': member['gndIdentifier'],
                }
                if 'preferredName' in member:
                    result['preferredName'] = member['preferredName']
    return result
325+
def searchGndByName(locationName):
    """Find a GND place record whose name equals ``locationName``.

    Queries the lobid.org GND search for ``locationName`` and returns the
    first member that has a ``gndIdentifier`` and whose ``preferredName``
    equals ``locationName`` or whose ``variantName`` contains it.

    Args:
        locationName: place name to search for.

    Returns:
        A dict with ``gndId`` and, when present in the record, ``longitude``,
        ``latitude``, ``variantNames`` and ``preferredName``; ``None`` when
        the request does not return status 200, the body is empty or not
        valid JSON, or no member matches by name.

    Raises:
        requests.RequestException: if the HTTP request itself fails.
    """
    # Passing the query via `params` lets requests escape the place name, so
    # names containing '&', '#', '+' etc. cannot corrupt the query string.
    page = requests.get(
        'https://lobid.org/gnd/search',
        params={
            'q': locationName,
            'filter': 'type:PlaceOrGeographicName',
            'format': 'json',
        },
        timeout=60,
    )
    if page.status_code != 200 or not page.content:
        return None
    try:
        jsonData = json.loads(page.content)
    except ValueError:
        # A 200 response without a JSON body is treated like "nothing found".
        return None
    if 'member' not in jsonData:
        return None

    for member in jsonData['member']:
        if 'gndIdentifier' not in member:
            continue
        nameMatches = (
            locationName in member.get('variantName', [])
            or member.get('preferredName') == locationName
        )
        if not nameMatches:
            continue

        result = {'gndId': member['gndIdentifier']}
        for geo in member.get('hasGeometry', []):
            if 'asWKT' in geo and geo.get('type') == 'Point':
                # WKT text has the form "Point ( <lon> <lat> )"; split() copes
                # with any amount of whitespace between the two numbers.
                coords = geo['asWKT'][0].replace('Point ', '').strip().strip('()').split()
                result['longitude'] = float(coords[0])
                result['latitude'] = float(coords[1])
        if 'variantName' in member:
            result['variantNames'] = member['variantName']
        if 'preferredName' in member:
            result['preferredName'] = member['preferredName']
        return result
    return None
369+
141370def strangeCharacters (testString , testCharacters ):
142371 count = 0
143372 for oneCharacter in testCharacters :
@@ -161,14 +390,14 @@ def strangeCharacters(testString, testCharacters):
161390
162391 if (entity .label_ in ['LOC' ,'GPE' ]):
163392 if (entity .text in indexLocations ):
164- indexLocations [entity .text ]['count' ] += 1
393+ indexLocations [entity .text ]['count' ] += 1 #TODO add valid value...
165394 indexLocations [entity .text ]['sentiment' ] += sentence .sentiment .polarity
166395 indexLocations [entity .text ]['subjectivity' ] += sentence .sentiment .subjectivity
167396 else :
168397 indexLocations [entity .text ] = {'phrase' :entity .text , 'label' :entity .label_ , 'sentiment' :sentence .sentiment .polarity ,
169398 'subjectivity' :sentence .sentiment .subjectivity , 'language' :lang , 'count' :1 ,
170- 'geonames' :- 1 , 'geotype' :None , 'latitude' :None , 'longitude' :None ,
171- 'country' :None , 'ipcc' :None }
399+ 'gnd' : None , ' geonames' :- 1 , 'geotype' :None , 'latitude' :None , 'longitude' :None ,
400+ 'continent' : None , ' country' :None , 'ipcc' :None }
172401 if ('geonames' in oldLocationsDf .columns ):
173402 foundInOlDf = oldLocationsDf [oldLocationsDf ['phrase' ]== entity .text ]
174403 foundInOlDf = foundInOlDf [foundInOlDf ['geonames' ]> - 0.5 ]
@@ -180,6 +409,10 @@ def strangeCharacters(testString, testCharacters):
180409 indexLocations [entity .text ]['longitude' ] = float (foundInOlDf ['longitude' ].mean ())
181410 indexLocations [entity .text ]['country' ] = foundInOlDf ['country' ].min ()
182411 indexLocations [entity .text ]['ipcc' ] = foundInOlDf ['ipcc' ].min ()
412+ if ('continent' in foundInOlDf .columns ):
413+ indexLocations [entity .text ]['continent' ] = foundInOlDf ['continent' ].min ()
414+ if ('gnd' in foundInOlDf .columns ):
415+ indexLocations [entity .text ]['gnd' ] = foundInOlDf ['gnd' ].min ()
183416
184417
185418 elif (entity .label_ in ['PER' ,'PERSON' ]):
@@ -219,13 +452,15 @@ def strangeCharacters(testString, testCharacters):
219452 indexMissing [entity .text ] = {'phrase' :entity .text , 'label' :entity .label_ , 'sentiment' :sentence .sentiment .polarity ,
220453 'subjectivity' :sentence .sentiment .subjectivity , 'language' :lang , 'count' :1 }
221454
222- colSent = ['phrase' , 'label' , 'sentiment' , 'subjectivity' , 'language' , 'count' , 'geonames' , 'geotype' , 'latitude' , 'longitude' , 'country' , 'ipcc' ]
223- indexLocationsDF = pd .DataFrame .from_dict (indexLocations , orient = 'index' , columns = colSent )
455+ colGeo = ['phrase' , 'label' , 'sentiment' , 'subjectivity' , 'language' , 'count' ,
456+ 'gnd' , 'geonames' , 'geotype' , 'latitude' , 'longitude' , 'continent' , 'country' , 'ipcc' ]
457+ indexLocationsDF = pd .DataFrame .from_dict (indexLocations , orient = 'index' , columns = colGeo )
224458indexLocationsDF ['sentiment' ] = indexLocationsDF ['sentiment' ]/ indexLocationsDF ['count' ]
225459indexLocationsDF ['subjectivity' ] = indexLocationsDF ['subjectivity' ]/ indexLocationsDF ['count' ]
226460indexLocationsDF = indexLocationsDF .sort_values (by = ['count' ], ascending = False )
227- indexLocationsDF .to_csv (DATA_PATH / 'csv' / "sentiments_locations.csv" , index = True )
461+ indexLocationsDF .to_csv (DATA_PATH / 'csv' / "sentiments_locations.csv" , index = True , float_format = '%.8f' )
228462
463+ colSent = ['phrase' , 'label' , 'sentiment' , 'subjectivity' , 'language' , 'count' ]
229464indexPersonsDF = pd .DataFrame .from_dict (indexPersons , orient = 'index' , columns = colSent )
230465indexPersonsDF ['sentiment' ] = indexPersonsDF ['sentiment' ]/ indexPersonsDF ['count' ]
231466indexPersonsDF ['subjectivity' ] = indexPersonsDF ['subjectivity' ]/ indexPersonsDF ['count' ]
0 commit comments