Notice
I have also written a more accessible article that explains the workflow introduced here. Please refer to it as well.

Overview
I would like to introduce a prototype tool for creating annotated IIIF manifest files and TEI/XML files using NDL Klasseki OCR-Lite.
Creating Annotated IIIF Manifest Files
First, I created a Gradio app that takes a IIIF manifest file as input and outputs an annotated IIIF manifest file using NDL Kotenseki OCR-Lite. It is published on Hugging Face Spaces.


As output, you get an annotated IIIF manifest file like the following.
{
"@context": "http://iiif.io/api/presentation/3/context.json",
"id": "https://dl.ndl.go.jp/api/iiif/3437686/manifest.json",
"type": "Manifest",
"label": {
"none": [
"ę ”ē°ęŗę°ē©čŖ. å·»äø"
]
},
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1",
"type": "Canvas",
"width": 6890,
"height": 4706,
"label": {
"none": [
"1"
]
},
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/page",
"type": "AnnotationPage",
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/page/imageanno",
"type": "Annotation",
"motivation": "sc:painting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1",
"body": {
"id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000001/full/full/0/default.jpg",
"type": "Image",
"format": "image/jpeg",
"width": 6890,
"height": 4706,
"service": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000001",
"type": "ImageService2",
"profile": "level2"
}
]
}
}
]
}
],
"annotations": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos",
"type": "AnnotationPage",
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/0",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5270,275,114,935",
"body": {
"type": "TextualBody",
"value": "äøć»ćć»ć»ć»ć»ć»ć»äøäøäøäøć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/1",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5293,2009,218,424",
"body": {
"type": "TextualBody",
"value": "āć"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/2",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=5092,3272,63,80",
"body": {
"type": "TextualBody",
"value": "äøäø"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/3",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4375,304,103,1475",
"body": {
"type": "TextualBody",
"value": "ć¹ćććå
ćććäøćććććććäøäøäøćććäøäøäøäøććććććććććäøäøć»ććć»ć»ć»ć»ć»ć»ć»ć®ććć»ć»ć»ć»äøć»ć»ć»"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/4",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4375,2853,45,522",
"body": {
"type": "TextualBody",
"value": "ā”ēēā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/5",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=4283,2756,63,252",
"body": {
"type": "TextualBody",
"value": "ćććććććććććććććććććććććććäøććäøććć"
}
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1/annos/6",
"type": "Annotation",
"motivation": "commenting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/1#xywh=694,499,310,2991",
"body": {
"type": "TextualBody",
"value": "åę ”ē°ęŗę°ē©å·»äø"
}
}
]
}
]
},
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2",
"type": "Canvas",
"width": 6890,
"height": 4706,
"label": {
"none": [
"2"
]
},
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/page",
"type": "AnnotationPage",
"items": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/page/imageanno",
"type": "Annotation",
"motivation": "sc:painting",
"target": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2",
"body": {
"id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000002/full/full/0/default.jpg",
"type": "Image",
"format": "image/jpeg",
"width": 6890,
"height": 4706,
"service": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/R0000002",
"type": "ImageService2",
"profile": "level2"
}
]
}
}
]
}
],
"annotations": [
{
"id": "https://dl.ndl.go.jp/api/iiif/3437686/canvas/2/annos",
"type": "AnnotationPage",
"items": []
}
]
}
]
}
Creating TEI/XML Files
I created a library that takes the annotated IIIF manifest file obtained above as input and creates TEI/XML files.
It can be installed and used with a package.json like the following.
{
"name": "convert",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"@nakamura196/iiif-to-tei": "^1.0.1",
"glob": "^11.0.2"
}
}
Place manifest files in the data/input folder and run the following to output TEI/XML files in the data/output folder.
/**
 * Batch-converts annotated IIIF manifest files to TEI/XML.
 *
 * Reads every `*.json` file in `./data/input`, converts it with
 * IIIFToTEIConverter, and writes the result to `./data/output` under the
 * same base name with an `.xml` extension.
 */
const { IIIFToTEIConverter } = require('@nakamura196/iiif-to-tei');
const fs = require('fs');
const path = require('path');
const glob = require('glob');

const inputDir = './data/input';
const outputDir = './data/output';

// Create output directory if it doesn't exist
if (!fs.existsSync(outputDir)) {
  fs.mkdirSync(outputDir, { recursive: true });
  console.log(`Created output directory: ${outputDir}`);
}

// Get all JSON files in the input directory.
// glob patterns must use forward slashes: on Windows `path.join` would emit
// backslashes, which glob interprets as escape characters, so nothing would
// match. `path.posix.join` always joins with "/".
const jsonFiles = glob.sync(path.posix.join(inputDir, '*.json'));

// Process each JSON file
for (const jsonFile of jsonFiles) {
  const manifest = JSON.parse(fs.readFileSync(jsonFile, 'utf8'));

  // A fresh converter is created per manifest, as in the original workflow.
  const converter = new IIIFToTEIConverter({
    includeImages: true,
    includeFacsimile: true,
  });
  const teiXml = converter.convert(manifest);

  // Generate output filename (replace .json with .xml)
  const outputFile = path.join(outputDir, `${path.basename(jsonFile, '.json')}.xml`);

  // Save XML file
  fs.writeFileSync(outputFile, teiXml, 'utf8');
  console.log(`Conversion complete: ${path.basename(jsonFile)} -> ${path.basename(outputFile)}`);
}
An example of the output TEI/XML file is as follows.
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title>校異源氏物語. 巻一</title>
</titleStmt>
<publicationStmt>
<p>Converted from IIIF Manifest</p>
</publicationStmt>
<sourceDesc>
<msDesc>
<msIdentifier>
<idno>https://dl.ndl.go.jp/api/iiif/3437686/manifest.json</idno>
</msIdentifier>
</msDesc>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<body>
<div n="1">
<ab type="line" corresp="#zone-0-0">äøć»ćć»ć»ć»ć»ć»ć»äøäøäøäøć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»ć»</ab>
<ab type="line" corresp="#zone-0-1">āć</ab>
<ab type="line" corresp="#zone-0-2">äøäø</ab>
<ab type="line" corresp="#zone-0-3">ć¹ćććå
ćććäøćććććććäøäøäøćććäøäøäøäøććććććććććäøäøć»ććć»ć»ć»ć»ć»ć»ć»ć®ććć»ć»ć»ć»äøć»ć»ć»</ab>
<ab type="line" corresp="#zone-0-4">ā”ēēā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”ā”</ab>
<ab type="line" corresp="#zone-0-5">ćććććććććććććććććććććććććäøććäøććć</ab>
<ab type="line" corresp="#zone-0-6">åę ”ē°ęŗę°ē©å·»äø</ab>
</div>
<div n="2"/>
</body>
</text>
<facsimile sameAs="https://dl.ndl.go.jp/api/iiif/3437686/manifest.json">
<surface sameAs="https://dl.ndl.go.jp/api/iiif/3437686/canvas/1" ulx="0" uly="0" lrx="6890" lry="4706">
<graphic url="https://dl.ndl.go.jp/api/iiif/3437686/R0000001/full/full/0/default.jpg"/>
<zone xml:id="zone-0-0" ulx="5270" uly="275" lrx="5384" lry="1210"/>
<zone xml:id="zone-0-1" ulx="5293" uly="2009" lrx="5511" lry="2433"/>
<zone xml:id="zone-0-2" ulx="5092" uly="3272" lrx="5155" lry="3352"/>
<zone xml:id="zone-0-3" ulx="4375" uly="304" lrx="4478" lry="1779"/>
<zone xml:id="zone-0-4" ulx="4375" uly="2853" lrx="4420" lry="3375"/>
<zone xml:id="zone-0-5" ulx="4283" uly="2756" lrx="4346" lry="3008"/>
<zone xml:id="zone-0-6" ulx="694" uly="499" lrx="1004" lry="3490"/>
</surface>
<surface sameAs="https://dl.ndl.go.jp/api/iiif/3437686/canvas/2" ulx="0" uly="0" lrx="6890" lry="4706">
<graphic url="https://dl.ndl.go.jp/api/iiif/3437686/R0000002/full/full/0/default.jpg"/>
</surface>
</facsimile>
</TEI>
You can verify the output using Oxygen XML Editor as shown below.

Reference: Monorepo Development with Turborepo
For developing the npm package mentioned above, I used a monorepo with Turborepo.
The web app was developed using Next.js. It can be used from the following link.

The API can be verified through Swagger UI at the following link.

From Python, it can be used as follows.
import requests
import json
from typing import Optional, Dict, Any
from dataclasses import dataclass
@dataclass
class ConvertOptions:
    """Optional settings forwarded to the IIIF-to-TEI conversion API.

    Every field has a default, so ``ConvertOptions()`` is a valid value.
    """

    # Sent to the API as "includeImages".
    include_images: bool = False
    # Sent to the API as "includeFacsimile".
    include_facsimile: bool = False
    # Sent to the API as "baseUrl"; None when no base URL is supplied.
    base_url: Optional[str] = None
class IIIFToTEIClient:
    """IIIF to TEI conversion API client.

    Sends a IIIF manifest to the ``/api/convert`` endpoint of the configured
    server and returns the TEI XML string found in the JSON response.
    """

    def __init__(self, api_base_url: str):
        """
        Args:
            api_base_url: API base URL (e.g., "http://localhost:3000").
                Trailing slashes are stripped before the endpoint is built.
        """
        self.api_base_url = api_base_url.rstrip('/')
        self.convert_endpoint = f"{self.api_base_url}/api/convert"

    def convert_from_manifest(self,
                              manifest_object: Dict[str, Any],
                              options: "Optional[ConvertOptions]" = None) -> str:
        """
        Convert from IIIF manifest object to TEI XML.

        Args:
            manifest_object: IIIF manifest object
            options: Conversion options; when omitted, no "options" key is
                sent in the request payload.

        Returns:
            Converted TEI XML string

        Raises:
            requests.exceptions.RequestException: The HTTP request failed.
            ValueError: The response was not valid JSON, or the API reported
                an error.
        """
        payload = {
            "manifest": manifest_object
        }
        # Compare against None explicitly so that a supplied options object
        # is never skipped because of its truthiness.
        if options is not None:
            payload["options"] = {
                "includeImages": options.include_images,
                "includeFacsimile": options.include_facsimile,
                "baseUrl": options.base_url
            }
        return self._make_request(payload)

    def _make_request(self, payload: Dict[str, Any]) -> str:
        """
        Execute API request.

        Args:
            payload: Request payload

        Returns:
            Converted TEI XML string (empty string if the response has no
            "teiXml" key).

        Raises:
            requests.exceptions.RequestException: Transport failure or an
                HTTP error status; the original exception is chained as
                ``__cause__``.
            ValueError: The body is not valid JSON, or the response does not
                report ``success``.
        """
        try:
            response = requests.post(
                self.convert_endpoint,
                json=payload,
                headers={
                    'Content-Type': 'application/json'
                },
                timeout=30
            )
            # Check for HTTP errors
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise requests.exceptions.RequestException(
                f"API request failed: {str(e)}"
            ) from e

        # Parse the body outside the transport try-block: recent versions of
        # requests raise a JSON decode error that is *also* a
        # RequestException, so handling both in one try made the
        # "Invalid JSON response" branch unreachable. Every JSON decode error
        # is a ValueError, so that is what is caught here.
        try:
            result = response.json()
        except ValueError as e:
            raise ValueError(f"Invalid JSON response: {str(e)}") from e

        # Check for error response
        if not result.get('success', False):
            error_msg = result.get('error', 'Unknown error')
            details = result.get('details', '')
            raise ValueError(f"API Error: {error_msg}. Details: {details}")

        return result.get('teiXml', '')
# Initialize client
client = IIIFToTEIClient("https://iiif-tei-monorepo-web.vercel.app")

# Example: convert from a manifest object
try:
    # Set the actual manifest object here
    manifest_object = {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "id": "https://dl.ndl.go.jp/api/iiif/3437686/manifest.json",
        "type": "Manifest",
        # ... add the remaining manifest properties (e.g. "label", "items") here.
        # A bare `...` inside a dict literal is a SyntaxError, so the
        # placeholder is written as a comment.
    }
    tei_xml = client.convert_from_manifest(manifest_object)
    print("Conversion from manifest object successful!")
    # Save result to file
    with open("output.xml", "w", encoding="utf-8") as f:
        f.write(tei_xml)
except Exception as e:
    # Broad catch kept on purpose: this is a demo script that just reports
    # whatever went wrong (network, API, or file error).
    print(f"Error: {e}")
Summary
I introduced a workflow for creating TEI/XML files from OCR text produced by NDL Kotenseki OCR-Lite.
In the future, I would like to build a system that completes the process in a single app without going through multiple apps as described above.
There are various areas that need improvement, but I hope some parts serve as a useful reference.


Comments
…