From 8398a3573162bb584506a1ce0fa0c88f9db4fd91 Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@tremily.us>
Date: Thu, 4 Jan 2018 15:44:17 -0800
Subject: [PATCH] pull: Add JSON-LD markup

This should help with forward/backward compatibility, because
consumers can use JSON-LD to extract the semantic meaning regardless
of how I lay out the data.  From [1]:

  This information allows developers to re-use each other's data
  without having to agree to how their data will interoperate on a
  site-by-site basis.

There's a framing spec in the works [2] to support "request your own
layout" in an API.

I had to kludge a bit to support FSF IDs in licenses-full.json.  The
released JSON-LD 1.0 has index maps, but [3]:

  Note how the index keys do not appear in the Linked Data below, but
  would continue to exist if the document were compacted or
  expanded...

which isn't very useful for folks who are parsing the file as JSON-LD
and who need access to the FSF IDs.  To work around that, I've used
the FSF IDs as both the 'licenses' keys and as 'id' entries in the
'licenses' values.

We might be able to specify the semantics of the object keys with the
unreleased 1.1 [4] and its ID maps [5], but I've left that off for
now.

There is some background on classifying non-URI identifiers in [6].
Currently I'm not classifying my identifiers.

I'm using HTTPS identifiers for schema.org, because that's the
long-term target [7].

I've tested the output JSON-LD in [8], and the compacted
licenses-full.json looks like:

  {
    "http://tremily.us/fsf/schema/license.jsonld": [
      {
        "@index": "ACDL",
        "https://schema.org/identifier": "ACDL",
        "https://schema.org/keywords": "libre",
        "https://schema.org/name": "Apple's Common Documentation License, Version 1.0",
        "https://schema.org/url": {
          "@list": [
            "https://www.gnu.org/licenses/license-list.html#ACDL",
            "http://fedoraproject.org/wiki/Licensing/Common_Documentation_License"
          ]
        }
      },
      {
        "@index": "AGPLv1.0",
        "https://schema.org/identifier": [
          "AGPLv1.0",
          {
            "@index": "spdx",
            "@value": "AGPL-1.0"
          }
        ],
        "https://schema.org/keywords": "libre",
        "https://schema.org/name": "Affero General Public License version 1",
        "https://schema.org/url": {
          "@list": [
            "https://www.gnu.org/licenses/license-list.html#AGPLv1.0",
            "http://directory.fsf.org/wiki/License:AGPLv1"
          ]
        }
      },
      ...
    ]
  }

[1]: https://www.w3.org/TR/2014/REC-json-ld-20140116/#h3_the-context
[2]: https://json-ld.org/spec/latest/json-ld-framing/
[3]: https://www.w3.org/TR/2014/REC-json-ld-20140116/#data-indexing
[4]: https://json-ld.org/spec/latest/json-ld/#changes-since-1-0-recommendation-of-16-january-2014
[5]: https://json-ld.org/spec/latest/json-ld/#node-identifier-indexing
[6]: http://meta.schema.org/docs/datamodel.html#identifierBg
[7]: http://schema.org/docs/faq.html#19
[8]: https://json-ld.org/playground/
---
 pull.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/pull.py b/pull.py
index 995e2a7..0baab43 100755
--- a/pull.py
+++ b/pull.py
@@ -266,8 +266,9 @@ def extract(root, base_uri=None):
     return licenses
 
 
-def save(licenses, dir=os.curdir):
-    os.makedirs(dir, exist_ok=True)
+def save(licenses, base_uri, dir=os.curdir):
+    schema_dir = os.path.join(dir, 'schema')
+    os.makedirs(schema_dir, exist_ok=True)
     if sys.version_info >= (3, 5):
         paths = glob.glob(os.path.join(dir, '**', '*.json'), recursive=True)
     else:
@@ -277,17 +278,60 @@ def save(licenses, dir=os.curdir):
         )
     for path in paths:
         os.remove(path)
+    license_schema = {
+        '@context': {
+            'schema': 'https://schema.org/',
+            'id': {
+                '@id': 'schema:identifier'
+            },
+            'name': {
+                '@id': 'schema:name',
+            },
+            'uris': {
+                '@container': '@list',
+                '@id': 'schema:url',
+            },
+            'tags': {
+                '@id': 'schema:keywords',
+            },
+            'identifiers': {
+                '@container': '@index',
+                '@id': 'schema:identifier',
+            },
+        },
+    }
+    with open(os.path.join(schema_dir, 'license.jsonld'), 'w') as f:
+        json.dump(obj=license_schema, fp=f, indent=2)
+        f.write('\n')
+    license_schema_uri = urllib.parse.urljoin(
+        base=base_uri, url='schema/license.jsonld')
+    licenses_schema = license_schema.copy()
+    licenses_schema['@context']['licenses'] = {
+        '@container': '@index',
+        '@id': license_schema_uri,
+    }
+    licenses_schema.update(license_schema)
+    with open(os.path.join(schema_dir, 'licenses.jsonld'), 'w') as f:
+        json.dump(obj=licenses_schema, fp=f, indent=2, sort_keys=True)
+        f.write('\n')
+    licenses_schema_uri = urllib.parse.urljoin(
+        base=base_uri, url='schema/licenses.jsonld')
     index = sorted(licenses.keys())
     with open(os.path.join(dir, 'licenses.json'), 'w') as f:
-        json.dump(obj=index, fp=f, indent=2)
+        json.dump(obj=index, fp=f, indent=2, sort_keys=True)
         f.write('\n')
-    full_index = {}
+    full_index = {
+        '@context': licenses_schema_uri,
+        'licenses': {},
+    }
     for id, license in licenses.items():
         license = license.copy()
         if 'tags' in license:
             license['tags'] = sorted(license['tags'])
-        full_index[id] = license.copy()
         license['id'] = id
+        full_index['licenses'][id] = license.copy()
+        license['@context'] = urllib.parse.urljoin(
+            base=base_uri, url='schema/license.jsonld')
         license_path = os.path.join(dir, '{}.json'.format(id))
         with open(license_path, 'w') as f:
             json.dump(obj=license, fp=f, indent=2, sort_keys=True)
@@ -317,4 +361,4 @@ def save(licenses, dir=os.curdir):
     if unused_identifiers:
         raise ValueError('unused IDENTIFIERS keys: {}'.format(
             ', '.join(sorted(unused_identifiers))))
-    save(licenses=licenses, dir=dir)
+    save(licenses=licenses, base_uri='https://wking.github.io/fsf-api/', dir=dir)