
CrossScore: Towards Multi-View Image Evaluation and Scoring


Zirui Wang1    Wenjing Bian1    Omkar Parkhi2    Yuheng Ren2    Victor Adrian Prisacariu1


1University of Oxford     2Meta Reality Lab

arXiv    Code (Coming Soon)

TLDR: This method evaluates an image by comparing it with multiple views of the same scene through cross-attention, eliminating the need for a pre-aligned ground truth image.


Application: Evaluate rendered images from novel view synthesis (NVS) applications where ground truth references are unavailable.

We introduce an image assessment method that examines query images by referencing multiple views of the same scene, producing results termed CrossScore maps. Our results show that CrossScore is closely correlated with SSIM across diverse datasets, without requiring pre-aligned ground truth images. Colour coding: red represents the highest score, followed by orange, green, and blue, indicating decreasing scores respectively.

Abstract


We introduce a novel Cross-Reference image quality assessment method that fills a gap in the image assessment landscape, complementing the array of established evaluation schemes: Full-Reference metrics like SSIM, No-Reference metrics such as NIQE, General-Reference metrics including FID, and Multi-Modal-Reference metrics, e.g. CLIPScore.

We propose a novel cross-reference (CR) image quality assessment (IQA) scheme, which evaluates a query image using multiple unregistered reference images captured from different viewpoints. This approach sets a new research trajectory apart from conventional IQA schemes such as full-reference (FR), general-reference (GR), no-reference (NR), and multi-modal-reference (MMR).

Utilising a neural network with a cross-attention mechanism and a unique data collection pipeline from NVS optimisation, our method enables accurate image quality assessment without requiring ground truth references. By comparing a query image against multiple views of the same scene, our method addresses the limitations of existing metrics in novel view synthesis (NVS) and similar tasks where direct reference images are unavailable. Experimental results show that our method is closely correlated with the full-reference metric SSIM, while not requiring ground truth references.


Method


Our goal is to evaluate the quality of a query image using a set of reference images that capture the same scene as the query image but from other viewpoints. From the NVS application perspective, the query image is often a rendered image with artefacts, and the reference images consist of the real captured images.

Method Overview. Left: Our NVS-based data engine, which supplies query and reference images along with SSIM maps to drive the self-supervised training of our model. Right: Our model, which takes a query image and a set of reference images as input and predicts a score map for the query image.

Network


We propose a network that takes a query image and a set of reference images and predicts a dense score map for the query image. Our network consists of three components:

1. an image encoder, which extracts feature maps from input images;
2. a cross-reference module, which associates a query image with multi-view reference images; and
3. a score regression head, which regresses a CrossScore for each pixel of the query image.

In practice, we adapt a pretrained DINOv2-small model as the image encoder, a Transformer Decoder for the cross-reference module, and a shallow MLP for the score regression head.
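To make these three components concrete, below is a minimal PyTorch sketch. The class name CrossScoreNet, the layer counts, and the score range are illustrative assumptions; only the choice of a DINOv2-small encoder, a Transformer decoder, and a shallow MLP head follows the text above. The sketch predicts per-patch scores, which would be upsampled to a dense per-pixel map.

import torch
import torch.nn as nn

class CrossScoreNet(nn.Module):
    """Illustrative sketch: DINOv2-small encoder + cross-attention decoder + MLP head.

    Hyperparameters (heads, layers) are assumptions, not the published config.
    """

    def __init__(self, feat_dim=384, num_heads=6, num_layers=2):
        super().__init__()
        # Pretrained DINOv2-small: 384-d patch tokens (inputs sized to multiples of 14).
        self.encoder = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
        # Cross-reference module: query tokens attend to reference tokens.
        layer = nn.TransformerDecoderLayer(d_model=feat_dim, nhead=num_heads,
                                           batch_first=True)
        self.cross_ref = nn.TransformerDecoder(layer, num_layers=num_layers)
        # Shallow MLP regressing a score per query patch.
        self.head = nn.Sequential(nn.Linear(feat_dim, feat_dim), nn.GELU(),
                                  nn.Linear(feat_dim, 1), nn.Sigmoid())

    def forward(self, query, refs):
        # query: (B, 3, H, W); refs: (B, N, 3, H, W), N unregistered reference views.
        B, N = refs.shape[:2]
        q_tok = self.encoder.forward_features(query)["x_norm_patchtokens"]  # (B, P, C)
        r_tok = self.encoder.forward_features(refs.flatten(0, 1))["x_norm_patchtokens"]
        r_tok = r_tok.reshape(B, N * r_tok.shape[1], -1)                    # (B, N*P, C)
        fused = self.cross_ref(tgt=q_tok, memory=r_tok)  # query attends across all views
        return self.head(fused).squeeze(-1)              # (B, P); upsample for dense map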


Self-supervised Training


We leverage existing NVS systems and abundant multi-view datasets to generate SSIM maps for our training.


Specifically, we select Neural Radiance Field (NeRF)-style NVS systems as our data engine. Given a set of images, a NeRF recovers a neural representation of a scene by iteratively reconstructing the given image set with photometric losses.


By rendering images with the camera parameters of the originally captured image set at multiple NeRF training checkpoints, we generate a large number of images that contain various types of artefacts at various levels. From these, we compute SSIM maps between rendered images and the corresponding real captured images, which serve as our training objectives.
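A rough sketch of this data engine follows, assuming a hypothetical NeRF trainer exposing load_checkpoint and render methods (the real interface will differ) and using scikit-image's structural_similarity with full=True to obtain dense SSIM maps:

from skimage.metrics import structural_similarity

def build_training_samples(nerf, cameras, real_images, checkpoints):
    """Pair NeRF renders with dense SSIM maps as training targets.

    `nerf`, `load_checkpoint`, and `render` are hypothetical placeholders.
    """
    samples = []
    for ckpt in checkpoints:            # early/mid/late states -> varied artefact levels
        nerf.load_checkpoint(ckpt)
        for cam, real in zip(cameras, real_images):
            rendered = nerf.render(cam)             # (H, W, 3) float in [0, 1]
            # Dense SSIM map between the render and the real capture.
            _, ssim_map = structural_similarity(
                rendered, real, channel_axis=-1, data_range=1.0, full=True)
            ssim_map = ssim_map.mean(axis=-1)       # per-pixel training objective
            samples.append((rendered, real, ssim_map))
    return samples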


Additional Results

Evaluating images rendered from a popular NVS method (Gaussian Splatting) using CrossScore and SSIM. CrossScore is highly correlated with SSIM, while not requiring ground truth images.

Ablation: Enable and Disable Reference Images


Here, we show that our method effectively leverages reference views while evaluating a query image. With reference images enabled (ON), the score map predicted by our method contains more detail than when reference images are disabled (OFF), where the model tends to assign a high score everywhere.

Ablation study on the importance of reference images.
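One plausible way to wire up this ON/OFF switch, reusing the CrossScoreNet sketch from the Network section, is to replace the cross-attention memory with the query's own tokens in the OFF condition, so that no multi-view evidence reaches the decoder. This is an assumed mechanism, not necessarily the paper's exact protocol.

def predict(model, query, refs, use_refs=True):
    # Hypothetical ablation helper for the CrossScoreNet sketch above.
    q_tok = model.encoder.forward_features(query)["x_norm_patchtokens"]
    if use_refs:                                    # ON: attend to reference views
        B, N = refs.shape[:2]
        r_tok = model.encoder.forward_features(refs.flatten(0, 1))["x_norm_patchtokens"]
        memory = r_tok.reshape(B, N * r_tok.shape[1], -1)
    else:                                           # OFF: query attends to itself only
        memory = q_tok
    return model.head(model.cross_ref(tgt=q_tok, memory=memory)).squeeze(-1)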

Attention Weights Visualisation


We further illustrate that our model indeed attends to related context in the reference images, as evidenced by the visualisation of attention maps below.

Attention weights visualisation of our model. Top left: a query image with a region of interest (centre of image) highlighted with a magenta box. Right column: three reference images from our cross-reference set with attention maps overlaid. The attention maps illustrate the attention that is paid to predicting image quality at the query region. Red and blue denote high and low attention weights respectively. Note that we use 5 reference images in our experiment, but only 3 are shown due to space constraints. Bottom: predicted CrossScore map and SSIM map. Red and blue denote high- and low-quality image regions respectively.
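To reproduce this kind of visualisation with the sketch above, the decoder's cross-attention weights can be recovered by calling the last decoder layer's attention module directly on the encoded tokens. This reruns the attention outside the full decoder stack, so the weights are an approximation, and all module names refer to the illustrative sketch rather than released code.

import torch

@torch.no_grad()
def cross_attention_weights(model, query, refs):
    """Approximate cross-attention weights of the last decoder layer."""
    B, N = refs.shape[:2]
    q_tok = model.encoder.forward_features(query)["x_norm_patchtokens"]
    r_tok = model.encoder.forward_features(refs.flatten(0, 1))["x_norm_patchtokens"]
    r_tok = r_tok.reshape(B, N * r_tok.shape[1], -1)
    mha = model.cross_ref.layers[-1].multihead_attn   # cross-attention sub-module
    _, weights = mha(q_tok, r_tok, r_tok,
                     need_weights=True, average_attn_weights=True)
    # weights: (B, P_query, N*P_ref). Slice one query patch, split the last axis
    # into N views, and reshape each to the patch grid to overlay on the images.
    return weights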

Acknowledgement


This research is supported by an ARIA research gift grant from Meta Reality Lab. We gratefully thank Shangzhe Wu, Tengda Han, and Zihang Lai for insightful discussions, and Michael Hobley for proofreading.


BibTeX

@article{wang2024crossscore,
  title={CrossScore: Towards Multi-View Image Evaluation and Scoring},
  author={Zirui Wang and Wenjing Bian and Omkar Parkhi and Yuheng Ren and Victor Adrian Prisacariu},
  journal={arXiv preprint arXiv:2404.14409},
  year={2024}
}