
Commit 469d62d

test: Pytest with PySpark (#15)
* test: Pytest
* ci: renamed workflow
* ci: release JAR
* ci: YAML formatting
1 parent e027c1a commit 469d62d

12 files changed: +1629 −72 lines

.github/workflows/release.yml

+53 −53

@@ -7,56 +7,56 @@ jobs:
   release:
     runs-on: ubuntu-latest
     steps:

Lines 10–62 (the entire steps list) are removed and re-added with identical content — a formatting-only re-indentation, per the commit's "ci: YAML formatting". The resulting block:

      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: '0'

      - name: Get Author Name and Email
        run: |
          AUTHOR_NAME=$(git log -1 --pretty=format:%an ${{ github.sha }})
          AUTHOR_EMAIL=$(git log -1 --pretty=format:%ae ${{ github.sha }})
          echo "AUTHOR_NAME=$AUTHOR_NAME" >> $GITHUB_OUTPUT
          echo "AUTHOR_EMAIL=$AUTHOR_EMAIL" >> $GITHUB_OUTPUT
        id: author_info

      - name: Set up Java 8
        uses: actions/setup-java@v3
        with:
          java-version: "8"
          distribution: temurin
          server-id: ossrh
          server-username: OSSRH_JIRA_USERNAME
          server-password: OSSRH_JIRA_PASSWORD
          gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
          gpg-passphrase: GPG_PASSPHRASE

      - name: Cache local Maven repository
        uses: actions/cache@v3
        with:
          path: ~/.m2/repository
          key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
          restore-keys: |
            ${{ runner.os }}-maven-

      - uses: actions/setup-node@v4
        with:
          node-version: 20

      - name: "🔧 setup Bun"
        uses: oven-sh/setup-bun@v1

      - name: Semantic Release
        run: |
          bun install @conveyal/maven-semantic-release semantic-release @semantic-release/git conventional-changelog-conventionalcommits
          bun x semantic-release --prepare @conveyal/maven-semantic-release --publish @semantic-release/github,@conveyal/maven-semantic-release --verify-conditions @semantic-release/github,@conveyal/maven-semantic-release --verify-release @conveyal/maven-semantic-release
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GPG_KEY_NAME: ${{ secrets.GPG_KEY_NAME }}
          GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
          OSSRH_JIRA_USERNAME: ${{ secrets.OSSRH_JIRA_USERNAME }}
          OSSRH_JIRA_PASSWORD: ${{ secrets.OSSRH_JIRA_PASSWORD }}
          GIT_COMMITTER_NAME: "github-actions[bot]"
          GIT_COMMITTER_EMAIL: "41898282+github-actions[bot]@users.noreply.github.com"
          GIT_AUTHOR_NAME: ${{ steps.author_info.outputs.AUTHOR_NAME }}
          GIT_AUTHOR_EMAIL: ${{ steps.author_info.outputs.AUTHOR_EMAIL }}

.github/workflows/test.yml

+29 −14

@@ -1,24 +1,39 @@
-name: Maven Tests
+name: Maven and Python Tests
 "on":
   pull_request:
     types:
-      - opened
-      - edited
-      - synchronize
-      - reopened
+      - opened
+      - edited
+      - synchronize
+      - reopened
 env:
   QDRANT_URL: "${{ secrets.QDRANT_URL }}"
   QDRANT_API_KEY: "${{ secrets.QDRANT_API_KEY }}"
 jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-java@v3
-        with:
-          java-version: "8"
-          distribution: temurin
-      - name: Run the Maven tests
-        run: mvn test
-      - name: Generate assembly fat JAR
-        run: mvn clean package
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v3
+        with:
+          java-version: "8"
+          distribution: temurin
+      - name: Cache local Maven repository
+        uses: actions/cache@v3
+        with:
+          path: ~/.m2/repository
+          key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-maven-
+      - name: Run Maven tests
+        run: mvn test
+      - name: Generate JARs
+        run: mvn clean package -DskipTests
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+      - name: Install Python test dependencies
+        run: pip install -r src/test/python/requirements.txt
+      - name: Run Python tests
+        run: pytest

.github/workflows/upload-binaries.yml

+34 (new file)

@@ -0,0 +1,34 @@
+name: Build and release JAR files
+
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  upload-jar:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # release changes require contents write
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-java@v3
+        with:
+          java-version: "8"
+          distribution: temurin
+
+      - name: Generate JARs
+        run: mvn clean package -DskipTests
+
+      - name: Set project version env variable
+        run: |
+          echo "PROJECT_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)" >> $GITHUB_ENV
+
+      - name: Build and upload JAR
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          gh release upload ${{ github.event.release.tag_name }} target/spark-${{ env.PROJECT_VERSION }}.jar

.gitignore

+7 −1

@@ -16,4 +16,10 @@ buildNumber.properties
 # JDT-specific (Eclipse Java Development Tools)
 .classpath
 .vscode/
-.DS_Store
+.DS_Store
+
+poetry.lock
+.pytest_cache/
+*_pycache__
+
+senv

README.md

+3 −3

@@ -30,7 +30,7 @@ For use with Java and Scala projects, the package can be found [here](https://ce
 <dependency>
     <groupId>io.qdrant</groupId>
     <artifactId>spark</artifactId>
-    <version>2.0</version>
+    <version>2.0.1</version>
 </dependency>
 ```

@@ -43,7 +43,7 @@ from pyspark.sql import SparkSession

 spark = SparkSession.builder.config(
     "spark.jars",
-    "spark-2.0.jar", # specify the downloaded JAR file
+    "spark-2.0.1.jar", # specify the downloaded JAR file
 )
 .master("local[*]")
 .appName("qdrant")

@@ -75,7 +75,7 @@ You can use the `qdrant-spark` connector as a library in Databricks to ingest da

 - Go to the `Libraries` section in your cluster dashboard.
 - Select `Install New` to open the library installation modal.
-- Search for `io.qdrant:spark:2.0` in the Maven packages and click `Install`.
+- Search for `io.qdrant:spark:2.0.1` in the Maven packages and click `Install`.

 <img width="1064" alt="Screenshot 2024-01-05 at 17 20 01 (1)" src="https://github.com/qdrant/qdrant-spark/assets/46051506/d95773e0-c5c6-4ff2-bf50-8055bb08fd1b">
pom.xml

+1 −1

@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>io.qdrant</groupId>
   <artifactId>spark</artifactId>
-  <version>2.0.0</version>
+  <version>2.0.1</version>
   <name>qdrant-spark</name>
   <url>https://github.com/qdrant/qdrant-spark</url>
   <description>An Apache Spark connector for the Qdrant vector database</description>

src/main/java/io/qdrant/spark/QdrantValueFactory.java

+5

@@ -21,6 +21,11 @@ public class QdrantValueFactory {
   private QdrantValueFactory() {}

   public static Value value(InternalRow record, StructField field, int fieldIndex) {
+
+    if (record.isNullAt(fieldIndex)) {
+      return nullValue();
+    }
+
     DataType dataType = field.dataType();

     switch (dataType.typeName()) {
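
The added guard short-circuits value() when the incoming row is null at the given field index, returning a null Value instead of falling into the type switch. A minimal PySpark sketch of the kind of input this covers (the schema and column names are illustrative, not taken from this commit):

```python
# A rough sketch (not part of this commit) of input the new null guard handles:
# a Spark DataFrame where a nullable payload field is None for some rows.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[*]").appName("null-example").getOrCreate()

schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("city", StringType(), nullable=True),  # may be None
    ]
)

# The second row's "city" is null; QdrantValueFactory.value() now maps it to a
# null Value instead of dispatching on the field's data type.
df = spark.createDataFrame([(1, "Berlin"), (2, None)], schema=schema)
df.show()
```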

src/test/python/__init__.py

Whitespace-only changes.

src/test/python/conftest.py

+96 (new file)

@@ -0,0 +1,96 @@
+import pytest
+from testcontainers.core.container import DockerContainer  # type: ignore
+from testcontainers.core.waiting_utils import wait_for_logs  # type: ignore
+from qdrant_client import QdrantClient, models
+import uuid
+from pyspark.sql import SparkSession
+from typing import NamedTuple
+
+
+QDRANT_GRPC_PORT = 6334
+QDRANT_EMBEDDING_DIM = 6
+QDRANT_DISTANCE = models.Distance.COSINE
+
+
+class Qdrant(NamedTuple):
+    url: str
+    collection_name: str
+    client: QdrantClient
+
+
+qdrant_container = DockerContainer("qdrant/qdrant").with_exposed_ports(QDRANT_GRPC_PORT)
+
+
+# Reference: https://gist.github.com/dizzythinks/f3bb37fd8ab1484bfec79d39ad8a92d3
+def get_pom_version():
+    from xml.etree import ElementTree as et
+
+    ns = "http://maven.apache.org/POM/4.0.0"
+    et.register_namespace("", ns)
+    tree = et.ElementTree()
+    tree.parse("pom.xml")
+    p = tree.getroot().find("{%s}version" % ns)
+    return p.text
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_container(request):
+    qdrant_container.start()
+    wait_for_logs(
+        qdrant_container, ".*Actix runtime found; starting in Actix runtime.*", 60
+    )
+
+    def remove_container():
+        qdrant_container.stop()
+
+    request.addfinalizer(remove_container)
+
+
+@pytest.fixture(scope="session")
+def spark_session():
+    spark_session = (
+        SparkSession.builder.config(
+            "spark.jars", f"target/spark-{get_pom_version()}.jar"
+        )
+        .master("local[*]")
+        .appName("qdrant")
+        .getOrCreate()
+    )
+
+    return spark_session
+
+
+@pytest.fixture()
+def qdrant() -> Qdrant:
+    host = qdrant_container.get_container_host_ip()
+    grpc_port = qdrant_container.get_exposed_port(QDRANT_GRPC_PORT)
+
+    client = QdrantClient(
+        host=host,
+        grpc_port=grpc_port,
+        prefer_grpc=True,
+    )
+
+    collection_name = str(uuid.uuid4())
+    client.create_collection(
+        collection_name=collection_name,
+        vectors_config={
+            "dense": models.VectorParams(
+                size=QDRANT_EMBEDDING_DIM,
+                distance=QDRANT_DISTANCE,
+            ),
+            "": models.VectorParams(
+                size=QDRANT_EMBEDDING_DIM,
+                distance=QDRANT_DISTANCE,
+            ),
+        },
+        sparse_vectors_config={
+            "sparse": models.SparseVectorParams(),
+        },
+    )
+
+    return Qdrant(
+        url=f"http://{host}:{grpc_port}",
+        client=client,
+        collection_name=collection_name,
+    )
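
The test modules that consume these fixtures belong to the commit but are not shown in this excerpt. As a rough illustration only, a test combining the `spark_session` and `qdrant` fixtures might look like the sketch below; the `io.qdrant.spark.Qdrant` format name and the `qdrant_url`, `collection_name`, `embedding_field`, and `schema` options are assumptions based on the project README, not on files in this diff.

```python
# Hypothetical test sketch; connector option names follow the project README
# and are assumptions, not taken from the commit's (unshown) test files.
def test_upsert_row_count(spark_session, qdrant):
    # Embedding dimension matches QDRANT_EMBEDDING_DIM (6) used by the fixture.
    df = spark_session.createDataFrame(
        [(1, [0.1] * 6, "Alan"), (2, [0.2] * 6, "Grace")],
        schema=["id", "embedding", "name"],
    )

    df.write.format("io.qdrant.spark.Qdrant").option(
        "qdrant_url", qdrant.url  # gRPC URL of the test container
    ).option("collection_name", qdrant.collection_name).option(
        "embedding_field", "embedding"
    ).option("schema", df.schema.json()).mode("append").save()

    # The client fixture points at the same container, so the count is verifiable.
    assert qdrant.client.count(qdrant.collection_name).count == 2
```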

src/test/python/requirements.txt

+4 (new file)

@@ -0,0 +1,4 @@
+pyspark==3.5.1
+pytest==8.0.2
+qdrant-client==1.7.3
+testcontainers==3.7.1
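
Taken together, the CI job's build-and-test steps can be reproduced locally: build the fat JAR first (the `spark_session` fixture expects it under `target/`), install the pinned requirements, then run pytest with Docker available for testcontainers. A rough sketch of that sequence as a Python script (the workflow itself uses plain shell steps):

```python
# Rough local equivalent of the CI workflow's steps.
# Assumes Maven, Docker (for testcontainers), and Python 3.11 are installed.
import subprocess
import sys

# Mirrors "Generate JARs": the pytest fixtures load target/spark-<version>.jar.
subprocess.run(["mvn", "clean", "package", "-DskipTests"], check=True)

# Mirrors "Install Python test dependencies".
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "src/test/python/requirements.txt"],
    check=True,
)

# Mirrors "Run Python tests".
subprocess.run([sys.executable, "-m", "pytest"], check=True)
```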
