Prepare #9 (handover)

mam10eks committed May 3, 2023
1 parent 7ecaa30 commit d4cbe34
Showing 3 changed files with 62 additions and 6 deletions.
10 changes: 10 additions & 0 deletions Dockerfile
@@ -0,0 +1,10 @@
FROM webis/ir-axioms:1.0.0-base

RUN pip3 uninstall -y tira ir_axioms && \
    pip3 install git+https://github.com/tira-io/tira.git@development#subdirectory=python-client && \
    rm -Rf /ir_axioms

COPY . /ir_axioms

RUN cd /ir_axioms && pip install -e .[pyterrier]

28 changes: 23 additions & 5 deletions README.md
@@ -41,6 +41,8 @@ Relevance Judgments](examples/pyterrier_post_hoc_analysis_of_runs_and_qrels.ipyn
- [Axiomatic preferences for TREC Deep Learning 2020 runs (documents)](examples/trec_29_deep_documents_preferences_depth_10.ipynb)
[![Launch Binder](https://img.shields.io/badge/launch-binder-informational?style=flat-square)](https://mybinder.org/v2/gh/webis-de/ir_axioms/main?labpath=examples/trec_29_deep_documents_preferences_depth_10.ipynb)

### Usage with Docker

### Backends

TODO
@@ -58,12 +60,19 @@ scripts/slurm-start-jupyter-lab.sh
If you use this package or its components in your research, please cite the following paper describing the `ir_axioms`
framework and its use-cases:

You can use the following BibTeX entry for citation:

```bibtex
@InProceedings{bondarenko:2022d,
author = {Alexander Bondarenko and Maik Fr{\"o}be and {Jan Heinrich} Reimer and Benno Stein and Michael V{\"o}lske and Matthias Hagen},
booktitle = {45th International ACM Conference on Research and Development in Information Retrieval (SIGIR 2022)},
doi = {10.1145/3477495.3531743},
editor = {Enrique Amig{\'{o}} and Pablo Castells and Julio Gonzalo and Ben Carterette and J. Shane Culpepper and Gabriella Kazai},
month = jul,
pages = {3131--3140},
publisher = {ACM},
site = {Madrid, Spain},
title = {{Axiomatic Retrieval Experimentation with ir{\_}axioms}},
url = {https://dl.acm.org/doi/10.1145/3477495.3531743},
year = 2022
}
```

@@ -77,6 +86,15 @@ pip install build setuptools wheel

(On most systems, these packages are already pre-installed.)

Build Docker image via:

```shell
docker build -t webis/ir-axioms:1.0.0 .
docker push webis/ir-axioms:1.0.0
```

Run a Jupyter notebook inside the container via:

```shell
docker run --rm -ti -p 8888:8888 \
    -v ${PWD}:/workspace -w /workspace \
    -v /home/maik/.tira:/root/.tira/ \
    --entrypoint jupyter \
    webis/ir-axioms:1.0.0 notebook --ip 0.0.0.0 --allow-root
```
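Elsewhere in this commit, the PyTerrier backend starts reading the `termpipelines` property from the index instead of hardcoding `Stopwords,PorterStemmer`. As a rough illustration of how such a comma-separated property breaks down into individual pipeline stages (the helper below is illustrative only, not part of `ir_axioms`):

```python
def split_term_pipelines(prop: str) -> tuple:
    """Split a Terrier-style 'termpipelines' property into its stage names."""
    return tuple(stage.strip() for stage in prop.split(",") if stage.strip())

# The hardcoded default that this commit replaces with the index's own setting:
print(split_term_pipelines("Stopwords,PorterStemmer"))  # → ('Stopwords', 'PorterStemmer')
```

Each resulting stage name is then wrapped in a term pipeline accessor by the backend.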

### Installation

Install dependencies for developing the `ir_axioms` package:
30 changes: 29 additions & 1 deletion ir_axioms/backend/pyterrier/__init__.py
@@ -140,6 +140,9 @@ def document_frequency(self, term: str) -> int:
@lru_cache(None)
def _document_contents(self, document_id: str) -> str:
# Shortcut when ir_dataset is specified.
if not self.contents_accessor:
raise ValueError('No contents accessor was given. Please set "contents_accessor".')

documents_store = self._dataset_doc_store
if documents_store is not None:
try:
@@ -193,9 +196,15 @@ def _tokeniser(self) -> Tokeniser:

@cached_property
def _term_pipelines(self) -> Sequence[TermPipelineAccessor]:
default_termpipelines = "Stopwords,PorterStemmer"
if self._index:
    from jnius import cast
    # Prefer the term pipelines configured in the PyTerrier index itself.
    default_termpipelines = cast(
        "org.terrier.structures.PropertiesIndex", self._index
    ).getProperties().get("termpipelines")
    print(
        f'I will use the termpipelines "{default_termpipelines}" as configured '
        'in the PyTerrier index as default (can be overridden in ApplicationSetup).'
    )

term_pipelines = str(ApplicationSetup.getProperty(
    "termpipelines",
    default_termpipelines
))
return tuple(
BaseTermPipelineAccessor(pipeline)
@@ -224,6 +233,7 @@ def terms(
self,
query_or_document: Union[Query, Document]
) -> Sequence[str]:

text = self.contents(query_or_document)
return self._terms(text)

@@ -313,3 +323,21 @@ def retrieval_score(
document.id,
_weighting_model(model)
)


@dataclass(frozen=True)
class TerrierIndexOnlyContext(TerrierIndexContext):
index_location: Union[Path, IndexRef, Index]
tokeniser: Optional[Tokeniser] = None
cache_dir: Optional[Path] = None

def terms(
self,
query_or_document: Union[Query, Document]
) -> Sequence[str]:
raise ValueError('Extracting terms is not possible with a TerrierIndexOnlyContext.')

def document_contents(self, document: Document) -> str:
raise ValueError('Accessing document contents is not possible with a TerrierIndexOnlyContext.')

