
Commit fc0c82e

Merge pull request #1 from Minitour/feature/code-enhancements
Code refactoring
2 parents 0639a8b + f66d11d commit fc0c82e

14 files changed: +264 -118 lines changed

.github/workflows/release.yml (+34 -5)

@@ -1,12 +1,16 @@
-name: Publish Python Package
+name: CI Pipeline

 on:
   push:
+    branches: # Run on pushes to any branch
+      - '*'
+  pull_request: # Run on pull requests to any branch
     branches:
-      - master
+      - '*'

 jobs:
-  build:
+  test:
+    name: Run Unit Tests
     runs-on: ubuntu-latest

     steps:
@@ -15,13 +19,38 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.9'
+          python-version: '3.12'

       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install poetry
-          poetry install
+          poetry install --with test
+
+      - name: Run Tests
+        run: |
+          poetry run pytest
+
+
+  release:
+    name: Publish Python Package
+    needs: test # Ensure tests pass before publishing
+    runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/master'
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install poetry
+          poetry install

       - name: Publish package
         env:

README.md (+34 -5)

@@ -42,13 +42,25 @@ model = ChatGptModelParaphrase(api_key='sk-xyz', model='gpt-4o', temperature=0.7

 ```python
 from verbalizer.process import Processor
+from verbalizer.vocabulary import Vocabulary
+from verbalizer import Verbalizer

-ontology = 'pizza.ttl'
-name = 'pizza'
-processor = Processor(llm=model, vocab_ignore=ignore, vocab_rephrased=rephrased, min_statements=1)
-processor.process(name, ontology, output_dir='/path/to/my/output')
+ontology = Processor.from_file('pizza.ttl')
+
+# create vocabulary and verbalizer
+vocab = Vocabulary(ontology, ignore=ignore, rephrased=rephrased)
+verbalizer = Verbalizer(vocab)
+
+# start verbalization process
+results = Processor.verbalize_with(verbalizer, namespace="pizza", output_dir="./output")
 ```

+## Examples
+
+<details>
+
+<summary>Expand to see examples</summary>
+
 <table border="1">
 <tr>
 <th>OWL Fragment</th>
@@ -175,4 +187,21 @@ processor.process(name, ontology, output_dir='/path/to/my/output')
 Chicken topping is a type of meat topping that has at least some mild spiciness. It is different from pepperoni sausage topping, hot spiced beef topping, and ham topping.
 </td>
 </tr>
-</table>
+</table>
+
+</details>
+
+
+## Citation
+
+```
+@inproceedings{zaitoun2024generating,
+  title={Generating Ontology-Learning Training-Data through Verbalization},
+  author={Zaitoun, Antonio and Sagi, Tomer and Peleg, Mor},
+  booktitle={Proceedings of the AAAI Symposium Series},
+  volume={4},
+  number={1},
+  pages={233--241},
+  year={2024}
+}
+```
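Note on the updated README snippet: the new unit tests added in this PR indicate that `Processor.verbalize_with` returns a list of per-concept records, each carrying at least a `fragment` key with the OWL fragment that was verbalized. A minimal sketch of consuming those results, assuming the README snippet above has already run (any record field other than `fragment` would be an assumption):

```python
# Minimal sketch: iterate the records returned by Processor.verbalize_with.
# Only the 'fragment' key is confirmed by the new tests in this PR; any other
# field name would be an assumption about the record layout.
for entry in results:
    print(entry['fragment'])  # the Turtle fragment that was verbalized

print(f"{len(results)} concepts verbalized")
```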

evaluations/processed.py (+1 -1)

@@ -113,4 +113,4 @@ def test_evaluation(self):
         for ontology_name, contents in ontologies.items():
             file = contents['file']
             sampler = CustomSampler(samples=contents['samples'])
-            processor.process(ontology_name, file, data_sampler=sampler)
+            processor.verbalize_with(ontology_name, file, sampler=sampler)

playground.py (+17 -5)

@@ -4,6 +4,8 @@
 from verbalizer.nlp import ChatGptModelParaphrase, LlamaModelParaphrase
 from verbalizer.process import Processor
 from verbalizer.sampler import Sampler
+from verbalizer.verbalizer import Verbalizer
+from verbalizer.vocabulary import Vocabulary

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -87,9 +89,19 @@

 sampler = Sampler(sample_n=100, seed=42)

+ontologies = [
+    ('people', Processor.from_file('./data/people.ttl')),
+    ('pizza', Processor.from_file('./data/pizza.ttl')),
+    ('mondo', Processor.from_file('./data/mondo.owl')),
+    ('fma', Processor.from_file('./data/fma.owl')),
+]
+
+vocabularies = [
+    (namespace, Vocabulary(ontology, ignore=ignore, rephrased=rephrased))
+    for namespace, ontology in ontologies
+]
+
 for model in models:
-    processor = Processor(llm=model, vocab_ignore=ignore, vocab_rephrased=rephrased, min_statements=1)
-    processor.process('people', './data/people.ttl')
-    processor.process('pizza', './data/pizza.ttl')
-    processor.process('mondo', './data/mondo.owl', data_sampler=sampler)
-    processor.process('fma', './data/fma.owl', data_sampler=sampler)
+    for namespace, vocabulary in vocabularies:
+        verbalizer = Verbalizer(vocabulary, language_model=model)
+        results = Processor.verbalize_with(verbalizer, namespace=namespace, output_dir="./output", sampler=sampler)

pyproject.toml (+5 -2)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ontology-verbalizer"
-version = "1.0.1"
+version = "1.1.0"
 description = "A Python package for ontology verbalization"
 authors = ["Antonio Zaitoun <[email protected]>"]
 license = "MIT"
@@ -10,12 +10,15 @@ packages = [
 ]
 repository = "https://github.com/Minitour/ontology-verbalizer"
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.12"
 rdflib = "~7.0.0"
 openai = "~1.12.0"
 pandas = "~2.2.0"
 tqdm = "~4.66.2"

+[tool.poetry.group.test.dependencies]
+pytest = "~8.3.4"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"

tests/__init__.py

Whitespace-only changes.

tests/test_verbalization.py (+66)

@@ -0,0 +1,66 @@
+import unittest
+
+from rdflib import Graph
+
+from verbalizer.process import Processor
+from verbalizer.sampler import Sampler
+from verbalizer.vocabulary import Vocabulary
+from verbalizer import Verbalizer
+
+rename_iri = {
+    'http://www.w3.org/2002/07/owl#equivalentClass': 'is same as',
+    'http://www.w3.org/2000/01/rdf-schema#subClassOf': 'is a type of',
+    'http://www.w3.org/2002/07/owl#intersectionOf': 'all of',
+    'http://www.w3.org/2002/07/owl#unionOf': 'any of',
+    'http://www.w3.org/2002/07/owl#disjointWith': 'is different from',
+    'http://www.w3.org/2002/07/owl#withRestrictions': 'must be'
+}
+ignore_iri = {
+    'http://www.w3.org/2002/07/owl#onDatatype',
+    'http://www.w3.org/2000/01/rdf-schema#seeAlso',
+    'http://www.w3.org/2000/01/rdf-schema#label',
+    'http://www.w3.org/2000/01/rdf-schema#comment',
+    'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
+    'http://www.w3.org/2000/01/rdf-schema#isDefinedBy',
+    'http://www.w3.org/2003/06/sw-vocab-status/ns#term_status',
+    'http://www.w3.org/2000/01/rdf-schema#Class'
+}
+
+
+class TestVerbalization(unittest.TestCase):
+
+    def test_verbalization(self):
+        # graph
+        ontology = Processor.from_file('./data/foaf.owl')
+
+        # create vocabulary
+        vocab = Vocabulary(ontology, ignore=ignore_iri, rephrased=rename_iri)
+
+        # create verbalizer
+        verbalizer = Verbalizer(vocab)
+
+        results = Processor.verbalize_with(verbalizer, namespace='foaf')
+        self.assertEqual(12, len(results))
+
+        # Add default prefix (won't work without this)
+        fragment_sample = '@prefix : <https://zaitoun.dev#> .\n' + results[0]['fragment']
+        g = Graph()
+        g.parse(data=fragment_sample, format="turtle")
+
+        self.assertEqual(7, len(list(g.triples((None, None, None)))))
+
+    def test_verbalization_with_sampler(self):
+        # graph
+        ontology = Processor.from_file('./data/foaf.owl')
+
+        # create vocabulary
+        vocab = Vocabulary(ontology, ignore=ignore_iri, rephrased=rename_iri)
+
+        # create verbalizer
+        verbalizer = Verbalizer(vocab)
+
+        sampler = Sampler(sample_n=10, seed=42)
+        results = Processor.verbalize_with(verbalizer, namespace='foaf', sampler=sampler)
+
+        # although we sampled 10, only 7 were applicable.
+        self.assertEqual(7, len(results))

verbalizer/__init__.py (+1)

@@ -0,0 +1 @@
+from .verbalizer import Verbalizer

verbalizer/patterns/owl_disjoint.py (+5 -2)

@@ -1,7 +1,7 @@
 from rdflib import URIRef

 from verbalizer.patterns import Pattern
-from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge
+from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge, default_patterns
 from verbalizer.vocabulary import Vocabulary


@@ -46,7 +46,8 @@ def normalize(self, node: VerbalizationNode, triple_collector):
             relation_display = self.vocab.get_relationship_label(relation)

             if relation_display == Vocabulary.IGNORE_VALUE:
-                triple_collector.append((node.concept, relation, obj))
+                if self.vocab.should_keep(relation):
+                    triple_collector.append((node.concept, relation, obj))
                 continue

             next_node = VerbalizationNode(obj, parent_path=node.get_parent_path() + [(node.concept, relation)])
@@ -56,3 +57,5 @@ def normalize(self, node: VerbalizationNode, triple_collector):
             triple_collector.append((node.concept, relation, obj))

         return [(reference.relationship, reference.node.concept) for reference in node.references]
+
+default_patterns.append(OwlDisjointWith)

verbalizer/patterns/owl_first_rest.py (+3 -1)

@@ -1,7 +1,7 @@
 from verbalizer.patterns import Pattern
 from rdflib import URIRef

-from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge
+from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge, default_patterns


 class OwlFirstRestPattern(Pattern):
@@ -37,3 +37,5 @@ def normalize(self, node: VerbalizationNode, triple_collector):
             current = rest_node

         return [(reference.relationship, reference.node.concept) for reference in node.references]
+
+default_patterns.append(OwlFirstRestPattern)

verbalizer/patterns/owl_restriction.py (+3 -1)

@@ -1,7 +1,7 @@
 from rdflib import URIRef

 from verbalizer.patterns import Pattern
-from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge
+from verbalizer.verbalizer import VerbalizationNode, VerbalizationEdge, default_patterns


 class OwlRestrictionPattern(Pattern):
@@ -127,3 +127,5 @@ def _handle_cardinality(self, quantifier_relation, property_relation, obj_litera
             return f'has at least {literal_value}{on_class_label}{property_relation_label}{relation_plural_s}'
         elif quantifier_relation.endswith('maxCardinality') or quantifier_relation.endswith('maxQualifiedCardinality'):
             return f'has at most {literal_value}{on_class_label}{property_relation_label}{relation_plural_s}'
+
+default_patterns.append(OwlRestrictionPattern)
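Note on the three pattern files above: each module now registers its pattern class in `verbalizer.verbalizer.default_patterns` at import time instead of being wired up by the caller. A minimal sketch of how that registration becomes visible, based only on the names shown in these diffs (whether `verbalizer.verbalizer` already imports the pattern modules itself is not visible in this PR, so the explicit imports here are a conservative assumption):

```python
# Minimal sketch: importing the pattern modules is enough to register them.
# Only Pattern subclasses, the pattern module paths, and default_patterns are
# confirmed by this PR; everything else here is illustrative.
from verbalizer.verbalizer import default_patterns
from verbalizer.patterns import owl_disjoint, owl_first_rest, owl_restriction  # noqa: F401

# After the imports above, the three patterns from this PR are present in the
# default list used by newly constructed Verbalizer instances.
print([cls.__name__ for cls in default_patterns])
```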
