Notice

I have created a more accessible article explaining the workflow introduced in this article. Please also refer to the following.

Overview

I would like to introduce a prototype tool for creating annotated IIIF manifest files and TEI/XML files using NDL Kotenseki OCR-Lite (NDL古典籍OCR-Lite).

Creating Annotated IIIF Manifest Files

First, I created a Gradio app that takes a IIIF manifest file as input and outputs an annotated IIIF manifest file using NDL Kotenseki OCR-Lite. It is published on Hugging Face Spaces.

As output, you get an annotated IIIF manifest file like the following.

{
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "id": "https://dl.ndl.go.jp/api/iiif/3437686/manifest.json",
    "type": "Manifest",
    "label": {
      "none": [
        "ę ”ē•°ęŗę°ē‰©čŖž. 巻一"
      ]
    },
    "items": [
      {
        "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1",
        "type": "Canvas",
        "width": 6890,
        "height": 4706,
        "label": {
          "none": [
            "1"
          ]
        },
        "items": [
          {
            "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/page",
            "type": "AnnotationPage",
            "items": [
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/page/imageanno",
                "type": "Annotation",
                "motivation": "sc:painting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1",
                "body": {
                  "id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000001/full/full/0/default.jpg",
                  "type": "Image",
                  "format": "image/jpeg",
                  "width": 6890,
                  "height": 4706,
                  "service": [
                    {
                      "id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000001",
                      "type": "ImageService2",
                      "profile": "level2"
                    }
                  ]
                }
              }
            ]
          }
        ],
        "annotations": [
          {
            "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos",
            "type": "AnnotationPage",
            "items": [
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/0",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5270,275,114,935",
                "body": {
                  "type": "TextualBody",
                  "value": "äø€ćƒ»ć€‡ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»äø€äø€äø€äø€ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/1",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5293,2009,218,424",
                "body": {
                  "type": "TextualBody",
                  "value": "○〇"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/2",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5092,3272,63,80",
                "body": {
                  "type": "TextualBody",
                  "value": "一一"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/3",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4375,304,103,1475",
                "body": {
                  "type": "TextualBody",
                  "value": "ć‚¹ć€‡ć€‡ć€‡å…­ć€‡ć€‡ć€‡äø€ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡äø€äø€äø€ć€‡ć€‡ć€‡äø€äø€äø€äø€ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡ć€‡äø€äø€ćƒ»ć€‡ć€‡ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ć®ć€‡ć€‡ćƒ»ćƒ»ćƒ»ćƒ»äø€ćƒ»ćƒ»ćƒ»"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/4",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4375,2853,45,522",
                "body": {
                  "type": "TextualBody",
                  "value": "ā–”ē‰ēƒā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/5",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4283,2756,63,252",
                "body": {
                  "type": "TextualBody",
                  "value": "〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇一〇〇一〇〇〇"
                }
              },
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/6",
                "type": "Annotation",
                "motivation": "commenting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=694,499,310,2991",
                "body": {
                  "type": "TextualBody",
                  "value": "åŒę ”ē•°ęŗę°ē‰©å·»äø€"
                }
              }
            ]
          }
        ]
      },
      {
        "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2",
        "type": "Canvas",
        "width": 6890,
        "height": 4706,
        "label": {
          "none": [
            "2"
          ]
        },
        "items": [
          {
            "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/page",
            "type": "AnnotationPage",
            "items": [
              {
                "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/page/imageanno",
                "type": "Annotation",
                "motivation": "sc:painting",
                "target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2",
                "body": {
                  "id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000002/full/full/0/default.jpg",
                  "type": "Image",
                  "format": "image/jpeg",
                  "width": 6890,
                  "height": 4706,
                  "service": [
                    {
                      "id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000002",
                      "type": "ImageService2",
                      "profile": "level2"
                    }
                  ]
                }
              }
            ]
          }
        ],
        "annotations": [
          {
            "id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/annos",
            "type": "AnnotationPage",
            "items": []
          }
        ]
      }
    ]
  }

Creating TEI/XML Files

I created a library that takes the annotated IIIF manifest file obtained above as input and creates TEI/XML files.

It can be installed by declaring it as a dependency, as in the following package.json.

{
  "name": "convert",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@nakamura196/iiif-to-tei": "^1.0.1",
    "glob": "^11.0.2"
  }
}

Place manifest files in the data/input folder and run the following to output TEI/XML files in the data/output folder.

// Batch-convert annotated IIIF manifests (*.json) in ./data/input into
// TEI/XML files (*.xml) in ./data/output.
const { IIIFToTEIConverter } = require('@nakamura196/iiif-to-tei');
const fs = require('fs');
const path = require('path');
const glob = require('glob');

const input_dir = "./data/input";
const output_dir = "./data/output";

// With `recursive: true`, mkdirSync succeeds when the directory already exists
// and returns the first directory it actually created (undefined if none), so
// a separate existsSync check is unnecessary.
const createdDir = fs.mkdirSync(output_dir, { recursive: true });
if (createdDir) {
    console.log(`Created output directory: ${output_dir}`);
}

// Get all JSON files in the input directory.
// glob patterns must use forward slashes: on Windows, path.join() would emit
// backslashes, which glob interprets as escape characters and matches nothing.
const jsonFiles = glob.sync(path.posix.join(input_dir, "*.json")).sort();

// Process each JSON file independently so that one malformed manifest does
// not abort the rest of the batch.
for (const jsonFile of jsonFiles) {
    try {
        const jsonData = JSON.parse(fs.readFileSync(jsonFile, 'utf8'));

        // Initialize the converter
        const converter = new IIIFToTEIConverter({
            includeImages: true,
            includeFacsimile: true
        });

        const teiXml = converter.convert(jsonData);

        // Generate output filename (replace .json with .xml)
        const outputFile = path.join(output_dir, path.basename(jsonFile, '.json') + '.xml');

        // Save XML file
        fs.writeFileSync(outputFile, teiXml, 'utf8');
        console.log(`Conversion complete: ${path.basename(jsonFile)} -> ${path.basename(outputFile)}`);
    } catch (err) {
        console.error(`Conversion failed: ${path.basename(jsonFile)}: ${err.message}`);
        // Report the failure through the exit status while still processing the remaining files.
        process.exitCode = 1;
    }
}

An example of the output TEI/XML file is as follows.

<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>校異源氏物語. 巻一</title>
      </titleStmt>
      <publicationStmt>
        <p>Converted from IIIF Manifest</p>
      </publicationStmt>
      <sourceDesc>
        <msDesc>
          <msIdentifier>
            <idno>https://dl.ndl.go.jp/api/iiif/3437686/manifest.json</idno>
          </msIdentifier>
        </msDesc>
      </sourceDesc>
    </fileDesc>
  </teiHeader>
  <text>
    <body>
      <div n="1">
        <ab type="line" corresp="#zone-0-0">äø€ćƒ»ć€‡ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»äø€äø€äø€äø€ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»ćƒ»</ab>
        <ab type="line" corresp="#zone-0-1">○〇</ab>
        <ab type="line" corresp="#zone-0-2">一一</ab>
        <ab type="line" corresp="#zone-0-3">ス〇〇〇六〇〇〇一〇〇〇〇〇〇〇一一一〇〇〇一一一一〇〇〇〇〇〇〇〇〇〇一一・〇〇・・・・・・・の〇〇・・・・一・・・</ab>
        <ab type="line" corresp="#zone-0-4">ā–”ē‰ēƒā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”ā–”</ab>
        <ab type="line" corresp="#zone-0-5">〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇〇一〇〇一〇〇〇</ab>
        <ab type="line" corresp="#zone-0-6">同校異源氏物巻一</ab>
      </div>
      <div n="2"/>
    </body>
  </text>
  <facsimile sameAs="https://dl.ndl.go.jp/api/iiif/3437686/manifest.json">
    <surface sameAs="https://dl.ndl.go.jp/api/iiif/3437686/canvas/1" ulx="0" uly="0" lrx="6890" lry="4706">
      <graphic url="https://dl.ndl.go.jp/api/iiif/3437686/R0000001/full/full/0/default.jpg"/>
      <zone xml:id="zone-0-0" ulx="5270" uly="275" lrx="5384" lry="1210"/>
      <zone xml:id="zone-0-1" ulx="5293" uly="2009" lrx="5511" lry="2433"/>
      <zone xml:id="zone-0-2" ulx="5092" uly="3272" lrx="5155" lry="3352"/>
      <zone xml:id="zone-0-3" ulx="4375" uly="304" lrx="4478" lry="1779"/>
      <zone xml:id="zone-0-4" ulx="4375" uly="2853" lrx="4420" lry="3375"/>
      <zone xml:id="zone-0-5" ulx="4283" uly="2756" lrx="4346" lry="3008"/>
      <zone xml:id="zone-0-6" ulx="694" uly="499" lrx="1004" lry="3490"/>
    </surface>
    <surface sameAs="https://dl.ndl.go.jp/api/iiif/3437686/canvas/2" ulx="0" uly="0" lrx="6890" lry="4706">
      <graphic url="https://dl.ndl.go.jp/api/iiif/3437686/R0000002/full/full/0/default.jpg"/>
    </surface>
  </facsimile>
</TEI>

You can verify the output using Oxygen XML Editor as shown below.

Reference: Monorepo Development with Turborepo

For developing the npm package mentioned above, I used a monorepo with Turborepo.

The web app was developed using Next.js. It can be used from the following link.

The API can be verified through Swagger UI at the following link.

From Python, it can be used as follows.

import requests
import json
from typing import Optional, Dict, Any
from dataclasses import dataclass

@dataclass
class ConvertOptions:
    """Optional settings for a conversion request.

    Every field has a default, so ``ConvertOptions()`` is a valid instance.
    """
    # Image-inclusion switch; disabled unless set explicitly.
    include_images: bool = False
    # Facsimile-inclusion switch; disabled unless set explicitly.
    include_facsimile: bool = False
    # Base URL option; None means no value was provided.
    base_url: Optional[str] = None

class IIIFToTEIClient:
    """IIIF to TEI conversion API client.

    Posts a IIIF manifest object to ``<api_base_url>/api/convert`` and returns
    the TEI XML string carried in the JSON response.
    """

    def __init__(self, api_base_url: str, timeout: float = 30):
        """
        Args:
            api_base_url: API base URL (e.g., "http://localhost:3000");
                trailing slashes are stripped.
            timeout: Timeout in seconds passed to each HTTP request.
        """
        self.api_base_url = api_base_url.rstrip('/')
        self.convert_endpoint = f"{self.api_base_url}/api/convert"
        self.timeout = timeout

    def convert_from_manifest(self,
                             manifest_object: Dict[str, Any],
                             options: Optional["ConvertOptions"] = None) -> str:
        """
        Convert from IIIF manifest object to TEI XML

        Args:
            manifest_object: IIIF manifest object
            options: Conversion options; when omitted, no "options" key is sent

        Returns:
            Converted TEI XML string

        Raises:
            requests.RequestException: The HTTP request failed.
            ValueError: The response was not valid JSON or reported an error.
        """
        payload: Dict[str, Any] = {
            "manifest": manifest_object
        }

        if options is not None:
            request_options: Dict[str, Any] = {
                "includeImages": options.include_images,
                "includeFacsimile": options.include_facsimile,
            }
            # Leave "baseUrl" out when unset instead of sending an explicit null.
            if options.base_url is not None:
                request_options["baseUrl"] = options.base_url
            payload["options"] = request_options

        return self._make_request(payload)

    def _make_request(self, payload: Dict[str, Any]) -> str:
        """
        Execute API request

        Args:
            payload: Request payload

        Returns:
            Converted TEI XML string ('' when the response has no 'teiXml' key)

        Raises:
            requests.RequestException: Connection failure, timeout, or HTTP
                error status.
            ValueError: The body is not valid JSON, is not a JSON object, or
                the API did not report success.
        """
        try:
            # `json=` serializes the payload and sets the JSON Content-Type header.
            response = requests.post(
                self.convert_endpoint,
                json=payload,
                timeout=self.timeout
            )

            # Check for HTTP errors
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Keep the original response/request and cause available to callers.
            raise requests.RequestException(
                f"API request failed: {str(e)}",
                request=getattr(e, "request", None),
                response=getattr(e, "response", None),
            ) from e

        # Parse outside the block above: in current requests releases the JSON
        # decode error is itself a RequestException subclass, so parsing inside
        # it would report malformed JSON as a failed request.
        try:
            result = response.json()
        except ValueError as e:
            raise ValueError(f"Invalid JSON response: {str(e)}") from e

        if not isinstance(result, dict):
            raise ValueError(
                f"Invalid JSON response: expected an object, got {type(result).__name__}"
            )

        # Check for error response
        if not result.get('success', False):
            error_msg = result.get('error', 'Unknown error')
            details = result.get('details', '')
            raise ValueError(f"API Error: {error_msg}. Details: {details}")

        return result.get('teiXml', '')


# Initialize client
client = IIIFToTEIClient("https://iiif-tei-monorepo-web.vercel.app")

# Example: convert from a manifest object and save the result
try:
    # Set the actual manifest object here
    manifest_object = {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "id": "https://dl.ndl.go.jp/api/iiif/3437686/manifest.json",
        "type": "Manifest",
        # ... the remaining manifest properties ("label", "items", ...) go here.
        # A bare `...` inside a dict literal is a SyntaxError, so the
        # placeholder is written as a comment.
    }

    tei_xml = client.convert_from_manifest(manifest_object)
    print("Conversion from manifest object successful!")

    # Save result to file
    with open("output.xml", "w", encoding="utf-8") as f:
        f.write(tei_xml)

except Exception as e:
    # Example-level handling: report the failure instead of showing a traceback.
    print(f"Error: {e}")

Summary

I introduced a workflow for creating TEI/XML files from OCR text produced by NDL Kotenseki OCR-Lite.

In the future, I would like to build a system that completes the process in a single app without going through multiple apps as described above.

There are various areas that need improvement, but I hope some parts serve as a useful reference.