Update docs to include field name descriptions (#25)

willbeason · web-flow · commit 44171fd0e81d · 2025-02-13T11:29:56.000-06:00
* Add more docs on table definitions

WIP

Signed-off-by: Will Beason <willbeason@gmail.com>

* Remove stale table definitions

These were old definitions for the first iteration.

Signed-off-by: Will Beason <willbeason@gmail.com>

* Add explanation of every field in each table.

Add appendices for dictionary types with small dictionaries.

Signed-off-by: Will Beason <willbeason@gmail.com>

* Move table documentation to its own file.

Signed-off-by: Will Beason <willbeason@gmail.com>

* Update golangci-lint

Signed-off-by: Will Beason <willbeason@gmail.com>

* Update to Go 1.24

Causing CI failures

Signed-off-by: Will Beason <willbeason@gmail.com>

* Document 'gg' pattern

---------

Signed-off-by: Will Beason <willbeason@gmail.com>
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml
@@ -21,4 +21,4 @@ jobs:
       - name: golangci-lint
         uses: golangci/golangci-lint-action@v6
         with:
-          version: v1.63.4
+          version: v1.64.3
diff --git a/EXTRACTING_TABLES.md b/EXTRACTING_TABLES.md
@@ -108,11 +108,16 @@ These are currently a work in progress.
 
 ### Extracting Tables
 
-To extract tables, run `extract-columns` on the directory containing the JSONL files.
+To extract tables, run `extract-columns`, passing both the IN_DIR containing the JSONL files and the OUT_DIR to write tables to.
+
+To fully extract tables, you need to run the following three commands in order. The first and third commands are intentionally identical; this is not a mistake. The full process will take approximately one hour and will use about 10GiB of memory while running.
 
 ```shell
-go run cmd/extract-columns/extract-columns.go [papers|software] IN_DIR OUT_DIR
+go run cmd/extract-columns/extract-columns.go papers IN_DIR OUT_DIR
+go run cmd/extract-columns/extract-columns.go pdf IN_DIR OUT_DIR
+go run cmd/extract-columns/extract-columns.go papers IN_DIR OUT_DIR
 ```
 
-For now there are only two tables, `papers` and `software`.
-You may define new Parquet table definitions that extract information from the JSONL files, but you must insert the reference in extract-columns.go to use them.
+The reason for this is a circular dependency between the datasets, which can only be resolved by iterating over at least one of the datasets twice:
+1. Papers has a "has_mentions" field, which requires knowledge from the Mentions table of whether any mentions exist for a paper.
+2. Mentions has a "paper_id" field, which is computed as part of extracting the Papers table.
diff --git a/cmd/extract-columns/extract-column.go b/cmd/extract-columns/extract-column.go
@@ -106,6 +106,8 @@ func runE(_ *cobra.Command, args []string) error {
 	}
 }
 
+// The "gg" here is for a special file for two papers which were missing metadata
+// in the original SoftCite dataset.
 var (
 	paperPattern   = regexp.MustCompile(`([0-9a-f]{2}|gg)\.jsonl.gz`)
 	pdfPattern     = regexp.MustCompile(`[0-9a-f]{2}\.software\.jsonl\.gz`)
diff --git a/go.mod b/go.mod
@@ -1,6 +1,6 @@
 module github.com/willbeason/software-mentions
 
-go 1.23.1
+go 1.24
 
 require (
 	github.com/VividCortex/ewma v1.2.0
diff --git a/pkg/tables/papers.go b/pkg/tables/papers.go
@@ -43,16 +43,16 @@ var Papers = arrow.NewSchema([]arrow.Field{
 		).Build(),
 		Nullable: true,
 	},
-	{Name: "journal_name",
+	{Name: "publication_venue",
 		Type: arrow.BinaryTypes.String,
 		Metadata: NewMetadataBuilder().Add(
-			comment, "The parsed journal the paper was published in",
+			comment, "The parsed venue the paper was published in",
 		).Build(),
 	},
 	{Name: "publisher_name",
 		Type: arrow.BinaryTypes.String,
 		Metadata: NewMetadataBuilder().Add(
-			comment, "The parsed publisher of the paper's journal",
+			comment, "The parsed publisher of the paper's venue",
 		).Build(),
 		Nullable: true,
 	},
@@ -72,7 +72,7 @@ var Papers = arrow.NewSchema([]arrow.Field{
 	{Name: "pmid",
 		Type: arrow.BinaryTypes.String,
 		Metadata: NewMetadataBuilder().Add(
-			comment, "The PubMed Identifier of the paper",
+			comment, "The PubMed identifier of the paper",
 		).Build(),
 		Nullable: true,
 	},
@@ -82,6 +82,9 @@ var Papers = arrow.NewSchema([]arrow.Field{
 			ValueType: arrow.BinaryTypes.String,
 			Ordered:   false,
 		},
+		Metadata: NewMetadataBuilder().Add(
+			comment, "The type of document the paper is, such as a journal article or a book",
+		).Build(),
 		Nullable: true,
 	},
 	{Name: "license_type",
diff --git a/pkg/tables/tables.go b/pkg/tables/tables.go
@@ -1,62 +1,6 @@
 package tables
 
-import "github.com/apache/arrow/go/v18/arrow"
-
 const (
-	Software = "software"
-
+	Software   = "software"
 	ParquetExt = ".parquet"
 )
-
-var (
-	GrobidRunSchema = arrow.NewSchema([]arrow.Field{
-		{Name: "uuid", Type: arrow.BinaryTypes.String},
-		{Name: "application", Type: &arrow.DictionaryType{
-			IndexType: arrow.PrimitiveTypes.Uint8,
-			ValueType: arrow.BinaryTypes.String,
-			Ordered:   false,
-		}},
-		{Name: "date", Type: arrow.BinaryTypes.String},
-		{Name: "file", Type: arrow.BinaryTypes.String},
-		{Name: "softcite_file_name", Type: arrow.BinaryTypes.String},
-		{Name: "id", Type: arrow.BinaryTypes.String},
-		{Name: "md5", Type: arrow.BinaryTypes.String},
-		{Name: "metadata.id", Type: arrow.BinaryTypes.String},
-		{Name: "original_file_path", Type: arrow.BinaryTypes.String},
-		{Name: "runtime", Type: arrow.PrimitiveTypes.Uint32},
-		{Name: "version", Type: &arrow.DictionaryType{
-			IndexType: arrow.PrimitiveTypes.Uint8,
-			ValueType: arrow.BinaryTypes.String,
-			Ordered:   false,
-		}},
-	}, nil)
-
-	PapersSchema = arrow.NewSchema([]arrow.Field{
-		{Name: "uuid", Type: arrow.BinaryTypes.String},
-		{Name: "doi", Type: arrow.BinaryTypes.String},
-		{Name: "year", Type: arrow.PrimitiveTypes.Uint16},
-	}, nil)
-
-	SoftwareSchema = arrow.NewSchema([]arrow.Field{
-		{Name: "normalizedForm", Type: arrow.BinaryTypes.String},
-		{Name: "wikidataId", Type: arrow.BinaryTypes.String},
-		//{Name: "softwareType", Type: &arrow.DictionaryType{
-		//	IndexType: arrow.PrimitiveTypes.Uint8,
-		//	ValueType: arrow.BinaryTypes.String,
-		//	Ordered:   false,
-		//},
-		//},
-	}, nil)
-
-	MentionsSchema = arrow.NewSchema([]arrow.Field{
-		{Name: "paperId", Type: arrow.BinaryTypes.String},
-		{Name: "mentionIndex", Type: arrow.PrimitiveTypes.Uint16},
-		{Name: "normalizedForm", Type: arrow.BinaryTypes.String},
-		{Name: "documentContextAttributes.created.value", Type: arrow.FixedWidthTypes.Boolean},
-		{Name: "documentContextAttributes.shared.value", Type: arrow.FixedWidthTypes.Boolean},
-		{Name: "documentContextAttributes.used.value", Type: arrow.FixedWidthTypes.Boolean},
-		{Name: "mentionContextAttributes.created.value", Type: arrow.FixedWidthTypes.Boolean},
-		{Name: "mentionContextAttributes.shared.value", Type: arrow.FixedWidthTypes.Boolean},
-		{Name: "mentionContextAttributes.used.value", Type: arrow.FixedWidthTypes.Boolean},
-	}, nil)
-)
diff --git a/tables.md b/tables.md
@@ -0,0 +1,109 @@
+# Tables
+
+This document lists every table in the extracted SoftCite dataset, along with a description of each field.
+
+## Table Definitions
+
+The Parquet files are three tables of the SoftCite data.
+They do not contain all fields in the SoftCite dataset, but are a (hopefully useful) subset specifically related to mentions.
+
+Much of the information below can be gleaned from the metadata field `comment`, which is present in every table and for every field.
+Where this documentation conflicts with what is in `comment`, trust what is in `comment`.
+
+Where field names are repeated between tables, they have identical meaning (e.g. "paper_id").
+
+Tables are mostly normalized, but with several technically-redundant precalculated fields (such as "published_year" from "published_date") which have been added for convenience.
+
+### Papers
+
+This table contains paper metadata.
+Each entry represents a single paper analyzed by SoftCite.
+Many papers do not have any associated mentions - see the `has_mentions` field.
+
+- **paper_id** is a unique key for each paper, specific to this dataset.
+- **softcite_id** is the UUID for each paper in the original SoftCite dataset.
+- **title** is the title of the paper as parsed by SoftCite.
+- **published_year** is the year the paper was published, calculated from published_date.
+- **published_date** is the publication date of the paper as parsed by SoftCite.
+- **publication_venue** is the venue the paper was published in. This covers journals as well as other kinds of venue, such as books and conference proceedings.
+- **publisher_name** is the publisher of the paper's venue.
+- **doi** is the raw DOI of the paper (non-URL form).
+- **pmcid** is the PubMed Central identifier for the paper, if one exists.
+- **pmid** is the PubMed identifier of the paper, if one exists.
+- **genre** is the type of document the paper is, such as a journal article or a book. The full list of genres is shown [below](#genres).
+- **license_type** is the license of the document parsed by SoftCite. The full list of licenses is shown [below](#licenses).
+- **has_mentions** is whether SoftCite identified any software mentions for the paper.
+
+### Mentions
+
+This table contains an entry for every identified mention of a piece of software in the analyzed papers.
+
+- **software_mention_id** is a unique key for each software mention. It is a composite of _paper_id_, _source_file_type_, and _mention_index_.
+- **paper_id** is identical to _paper_id_ in the Papers table.
+- **source_file_type** is the format of the document parsed by SoftCite. For now this is always "pdf", but in the future may include other formats.
+- **mention_index** is a unique key for each mention within a paper.
+- **software_raw** is the raw string of the mentioned software.
+- **software_normalized** is a normalized form of _software_raw_.
+- **version_raw** is the version of the mentioned software, if present in the mention.
+- **version_normalized** is a normalized form of _version_raw_.
+- **publisher_raw** is the raw string of the publisher of the mentioned software, if present in the mention.
+- **publisher_normalized** is a normalized form of _publisher_raw_.
+- **language_raw** is the raw string of the mentioned software's programming language, if present in the mention.
+- **language_normalized** is a normalized form of _language_raw_.
+- **url_raw** is the raw string of the URL for the mentioned software, if present in the mention.
+- **url_normalized** is a normalized form of _url_raw_.
+- **context_full_text** is the surrounding context of the software mention in the paper, as parsed by SoftCite. This is often a sentence, but can be a fragment.
+
+### PurposeAssessments
+
+Each mention has Purpose Assessments which try to determine whether the mention has a given purpose.
+Each Mention in the Mentions table has exactly six of these assessments, one for each possible combination of scope and purpose (see below).
+
+- **software_mention_id** is identical to _software_mention_id_ in the Mentions table.
+- **paper_id** is identical to _paper_id_ in the Papers table.
+- **source_file_type** is identical to _source_file_type_ in the Mentions table.
+- **mention_index** is identical to _mention_index_ in the Mentions table.
+- **scope** is either "document" or "local". A "local" scope indicates the analysis was done specifically on the local context of the mention when determining its purpose. A "document" scope indicates that the analysis covered the entire document.
+- **purpose** is one of "created", "used", or "shared", representing the reason the software was mentioned in this context. These purposes are not mutually exclusive: a mention could indicate both that some software was created by the paper's authors and that it is available on GitHub, for instance, making it both "created" and "shared".
+
+### Appendix
+
+#### Genres
+
+For reference, these are the known values for the "genre" field:
+
+- "book"
+- "book-chapter"
+- "book-part"
+- "book-section"
+- "book-series"
+- "book-set"
+- "database"
+- "dataset"
+- "dissertation"
+- "edited-book"
+- "grant"
+- "journal"
+- "journal-article"
+- "journal-issue"
+- "journal-volume"
+- "monograph"
+- "other"
+- "peer-review"
+- "posted-content"
+- "proceedings"
+- "proceedings-article"
+- "proceedings-series"
+- "reference-book"
+- "reference-entry"
+- "report"
+- "report-component"
+- "report-series"
+- "standard"
+- NA (not present)
+
+#### Licenses
+
+For reference, these are the known values for the "license_type" field:
+
+TBD.

Original file line number	Diff line number	Diff line change
`@@ -106,6 +106,8 @@ func runE(_ *cobra.Command, args []string) error {`
`106`	`106`	`}`
`107`	`107`	`}`
`108`	`108`
	`109`	`+// The "gg" here is for a special file for two papers which were missing metadata`
	`110`	`+// in the original SoftCite dataset.`
`109`	`111`	`var (`
`110`	`112`	paperPattern = regexp.MustCompile(`([0-9a-f]{2}\|gg)\.jsonl.gz`)
`111`	`113`	pdfPattern = regexp.MustCompile(`[0-9a-f]{2}\.software\.jsonl\.gz`)