Skip to content

Commit 347e833

Browse files
committed
fix: GH url parsing
1 parent 239bc1d commit 347e833

2 files changed

Lines changed: 136 additions & 28 deletions

File tree

cmd/test_url_parsing/main.go

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,90 @@
11
package main
22

33
import (
4+
"bufio"
45
"encoding/json"
56
"fmt"
67
"io"
78
"net/http"
89
"net/url"
910
"os"
11+
"regexp"
1012
"strings"
1113
"time"
1214
)
1315

14-
func main() {
15-
// Try to find the specific post about Kitten TTS
16-
searchTerm := "Kitten TTS"
16+
// Precompiled patterns used by extractGitHubURL. They are compiled once at
// package scope instead of once per scanned line.
var (
	// ghMarkdownLinkRe matches a markdown link whose target is a GitHub repo,
	// e.g. [repo name](https://github.com/user/repo). The closing parenthesis
	// is excluded from the repo segment so it does not leak into the result.
	ghMarkdownLinkRe = regexp.MustCompile(`\[.*?\]\((https?://)?github\.com/([^/\s]+/[^/\s\)]+)`)

	// ghNestedLinkRe matches a GitHub URL used as the markdown link text,
	// e.g. [https://github.com/user/repo](https://github.com/user/repo).
	ghNestedLinkRe = regexp.MustCompile(`\[(https?://)?github\.com/([^/\s\]]+/[^/\s\]]+)\]`)

	// ghPrefixedURLRe matches GitHub URLs preceded by stray text such as
	// "months:", "Link]" or a U+2066 directional mark. The mark is written as
	// \x{2066} because a backslash followed by a non-ASCII rune is rejected by
	// Go's regexp syntax.
	ghPrefixedURLRe = regexp.MustCompile(`(?:months:|Link[\]\)]|APK[\]\)]|GitHub:|Github[\]\)]|\x{2066}|https?://)?\s*(?:\[|\()?(?:https?://)?github\.com/([^/\s\]\)]+/[^/\s\]\)]+)`)

	// ghBasicURLRe is the fallback pattern for a bare GitHub repo URL.
	ghBasicURLRe = regexp.MustCompile(`(?:https?://)?github\.com/([^/\s]+/[^/\s]+)`)
)

// trimRepoPath strips trailing punctuation that is not part of an
// "owner/repo" path.
func trimRepoPath(repoPath string) string {
	return strings.TrimRight(repoPath, ".,;:!?)\\]\"")
}

// extractGitHubURL is a copy of the extractGitHubURL function for testing.
//
// It scans content line by line and returns the first GitHub repository it
// finds, normalized to "https://github.com/<owner>/<repo>". Each line is tried
// against the markdown-link, nested-markdown, prefixed-URL and basic-URL
// patterns, in that order. It returns "" when content does not mention
// "github.com" or when no line matches any pattern.
func extractGitHubURL(content string) string {
	if !strings.Contains(content, "github.com") {
		return ""
	}

	scanner := bufio.NewScanner(strings.NewReader(content))
	// Allow a single line to be as long as the whole input; with the default
	// 64 KiB token limit a longer line would stop the scan with an error and
	// the function would report no URL at all.
	scanner.Buffer(make([]byte, 0, 4096), len(content)+1)

	for scanner.Scan() {
		line := scanner.Text()

		// Markdown links are the most common form in Reddit posts.
		if m := ghMarkdownLinkRe.FindStringSubmatch(line); m != nil {
			return "https://github.com/" + trimRepoPath(m[2])
		}

		// URL used as the link text itself.
		if m := ghNestedLinkRe.FindStringSubmatch(line); m != nil {
			return "https://github.com/" + trimRepoPath(m[2])
		}

		// URLs glued to text prefixes like "months:", "Link]", etc.
		if m := ghPrefixedURLRe.FindStringSubmatch(line); m != nil {
			return "https://github.com/" + trimRepoPath(m[1])
		}

		// Plain URL as a last resort.
		if m := ghBasicURLRe.FindStringSubmatch(line); m != nil {
			return "https://github.com/" + trimRepoPath(m[1])
		}
	}

	return ""
}
67+
68+
func main() {
69+
// Test cases from the provided data
70+
testURLs := []string{
71+
"https://months: [https://github.com/getlilac/lilac](https://github.com/getlilac/lilac",
72+
"https://[MCPJam](https://github.com/MCPJam/inspector",
73+
"https://[https://github.com/NevaMind-AI/memU](https://github.com/NevaMind-AI/memU",
74+
"https://⁦https://github.com/clidey/dory",
75+
"https://[Leaflet](https://github.com/Leaflet/Leaflet",
76+
"https://[https://github.com/TrueTheos/Aniki](https://github.com/TrueTheos/Aniki",
77+
"https://[https://github.com/spel987/PolyUploader](https://github.com/spel987/PolyUploader/",
78+
"https://github.com/naruaika/eruo-data-studio", // A proper URL for comparison
79+
"https://GitHub: [https://github.com/timoheimonen/securememo.app](https://github.com/timoheimonen/securememo.app",
80+
"https://[https://github.com/MCPJam/inspector](https://github.com/MCPJam/inspector",
81+
"https://(https://github.com/nsarathy/coffy",
82+
"https://[https://github.com/comma-compliance](https://github.com/comma-compliance",
83+
"https://Link](http://github.com/rohankishore/Schemix/",
84+
"https://APK](https://github.com/adeeteya/Awake-AlarmApp/releases/latest/download/Awake-Android.apk",
85+
}
3186

32-
apiURL := u.String()
33-
fmt.Printf("Searching with URL: %s\n", apiURL)
87+
fmt.Println("Testing GitHub URL extraction...")
3488

3589
// Make the request
3690
client := &http.Client{Timeout: 10 * time.Second}

news/reddit.go

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"net/http"
77
"net/url"
88
"os"
9+
"regexp"
910
"sort"
1011
"strings"
1112
"time"
@@ -95,28 +96,81 @@ type RedditGitHubPost struct {
9596

9697
// Extract GitHub URL from post content
9798
func extractGitHubURL(content string) string {
98-
// We'll just search for GitHub links directly in the content
99+
if !strings.Contains(content, "github.com") {
100+
return ""
101+
}
102+
103+
// Process content line by line for better control
99104
r := strings.NewReader(content)
100105
scanner := bufio.NewScanner(r)
101106

102107
for scanner.Scan() {
103108
line := scanner.Text()
104-
matches := strings.Split(line, " ")
105-
for _, match := range matches {
106-
if strings.Contains(match, "github.com") && strings.Count(match, "/") >= 2 {
107-
// Ensure it starts with http/https
108-
if !strings.HasPrefix(match, "http") {
109-
match = "https://" + match
110-
}
111-
// Clean the URL (remove trailing characters)
112-
match = strings.TrimRight(match, ".,;:!?)")
113-
return match
114-
}
109+
110+
// Handle markdown links with GitHub URLs - most common in Reddit posts
111+
// Example: [repo name](https://github.com/user/repo)
112+
markdownLinkRegex := regexp.MustCompile(`\[.*?\]\((https?://)?github\.com/([^/\s]+/[^/\s\)]+)`)
113+
markdownMatches := markdownLinkRegex.FindStringSubmatch(line)
114+
if len(markdownMatches) > 2 {
115+
return cleanGitHubURL("https://github.com/" + markdownMatches[2])
116+
}
117+
118+
// Handle markdown nested in brackets
119+
// Example: [https://github.com/user/repo](https://github.com/user/repo)
120+
nestedMarkdownRegex := regexp.MustCompile(`\[(https?://)?github\.com/([^/\s\]]+/[^/\s\]]+)\]`)
121+
nestedMatches := nestedMarkdownRegex.FindStringSubmatch(line)
122+
if len(nestedMatches) > 2 {
123+
return cleanGitHubURL("https://github.com/" + nestedMatches[2])
124+
}
125+
126+
// Handle URLs with text prefixes like "months:", "Link]", etc.
127+
// Example: months: [https://github.com/user/repo](https://github.com/user/repo
128+
prefixedURLRegex := regexp.MustCompile(`(?:months:|Link[\]\)]|APK[\]\)]|GitHub:|Github[\]\)]|https?://)?\s*(?:\[|\()?(?:https?://)?github\.com/([^/\s\]\)]+/[^/\s\]\)]+)`)
129+
prefixMatches := prefixedURLRegex.FindStringSubmatch(line)
130+
if len(prefixMatches) > 1 {
131+
return cleanGitHubURL("https://github.com/" + prefixMatches[1])
132+
}
133+
134+
// Basic GitHub URL pattern as fallback
135+
basicURLRegex := regexp.MustCompile(`(?:https?://)?github\.com/([^/\s]+/[^/\s]+)`)
136+
basicMatches := basicURLRegex.FindStringSubmatch(line)
137+
if len(basicMatches) > 1 {
138+
return cleanGitHubURL("https://github.com/" + basicMatches[1])
115139
}
116140
}
141+
117142
return ""
118143
}
119144

145+
// cleanGitHubURL normalizes a GitHub link to the form
// "https://github.com/<owner>/<repo>".
//
// Trailing punctuation (.,;:!?)") is stripped, path segments after the
// repository are dropped, and a query string or fragment attached to the
// repository segment is removed. If the input does not contain "github.com/"
// followed by at least an owner and a repository segment, the
// punctuation-trimmed input is returned as is.
//
// The parameter is named rawURL rather than url so it does not shadow the
// net/url package imported by this file.
func cleanGitHubURL(rawURL string) string {
	const trailingJunk = ".,;:!?)\""

	// Clean trailing characters that aren't part of repository names.
	rawURL = strings.TrimRight(rawURL, trailingJunk)

	// Isolate the path that follows the host.
	parts := strings.Split(rawURL, "github.com/")
	if len(parts) < 2 {
		return rawURL
	}

	// Keep only the owner and repository segments.
	segments := strings.Split(parts[1], "/")
	if len(segments) < 2 {
		return rawURL
	}
	owner, repo := segments[0], segments[1]

	// A query string or fragment is never part of a repository name.
	if i := strings.IndexAny(repo, "?#"); i >= 0 {
		repo = repo[:i]
	}

	// The repo segment may carry its own trailing punctuation when further
	// path segments followed it (e.g. "repo)/issues").
	repo = strings.TrimRight(repo, trailingJunk)

	return "https://github.com/" + owner + "/" + repo
}
173+
120174
// FetchRedditGitHubPosts fetches GitHub repos from specified subreddits from the last two weeks
121175
func FetchRedditGitHubPosts(sortBy string) ([]RedditGitHubPost, error) {
122176
token, err := getRedditToken()

0 commit comments

Comments
 (0)