Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: video speech transcription #1264

Merged
merged 4 commits into from
Nov 17, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions video/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ Detect Shots
mvn exec:java -DDetect -Dexec.args="shots gs://demomaker/gbikes_dinosaur.mp4"
```

Transcribe Speech
```
mvn exec:java -DDetect -Dexec.args="speech-transcription gs://python-docs-samples-tests/video/googlework_short.mp4"
```

From Windows, you may need to supply your classpath differently, for example:
```
mvn exec:java -DDetect -Dexec.args="labels gs://demomaker/cat.mp4"
Expand Down
77 changes: 76 additions & 1 deletion video/cloud-client/src/main/java/com/example/video/Detect.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,21 @@
import com.google.cloud.videointelligence.v1.Feature;
import com.google.cloud.videointelligence.v1.LabelAnnotation;
import com.google.cloud.videointelligence.v1.LabelSegment;
import com.google.cloud.videointelligence.v1.SpeechRecognitionAlternative;
import com.google.cloud.videointelligence.v1.SpeechTranscription;
import com.google.cloud.videointelligence.v1.SpeechTranscriptionConfig;
import com.google.cloud.videointelligence.v1.VideoAnnotationResults;
import com.google.cloud.videointelligence.v1.VideoContext;
import com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClient;
import com.google.cloud.videointelligence.v1.VideoSegment;
import com.google.cloud.videointelligence.v1.WordInfo;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.TimeUnit;

import org.apache.commons.codec.binary.Base64;


Expand Down Expand Up @@ -83,6 +90,9 @@ public static void argsHelper(String[] args) throws Exception {
if (command.equals("explicit-content")) {
analyzeExplicitContent(path);
}
if (command.equals("speech-transcription")) {
speechTranscription(path);
}
}

/**
Expand Down Expand Up @@ -322,4 +332,69 @@ public static void analyzeExplicitContent(String gcsUri) throws Exception {
// [END video_analyze_explicit_content]
}
}
}

/**
* Transcribe speech from a video stored on GCS.
*
* @param gcsUri the path to the video file to analyze.
*/
public static void speechTranscription(String gcsUri) throws Exception {
// [START video_speech_transcription_gcs]
// Instantiate a com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClient
try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
// Set the language code
SpeechTranscriptionConfig config = SpeechTranscriptionConfig.newBuilder()
.setLanguageCode("en-US")
.setEnableAutomaticPunctuation(true)
.build();

// Set the video context with the above configuration
VideoContext context = VideoContext.newBuilder()
.setSpeechTranscriptionConfig(config)
.build();

// Create the request
AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
.setInputUri(gcsUri)
.addFeatures(com.google.cloud.videointelligence.v1.Feature.SPEECH_TRANSCRIPTION)
anguillanneuf marked this conversation as resolved.
Show resolved Hide resolved
.setVideoContext(context)
.build();

// asynchronously perform speech transcription on videos
OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
client.annotateVideoAsync(request);

System.out.println("Waiting for operation to complete...");
// Display the results
for (VideoAnnotationResults results : response.get(600, TimeUnit.SECONDS)
.getAnnotationResultsList()) {
for (SpeechTranscription speechTranscription : results.getSpeechTranscriptionsList()) {
try {
// Print the transcription
if (speechTranscription.getAlternativesCount() > 0) {
SpeechRecognitionAlternative alternative = speechTranscription.getAlternatives(0);

System.out.printf("Transcript: %s\n", alternative.getTranscript());
System.out.printf("Confidence: %.2f\n", alternative.getConfidence());

System.out.println("Word level information:");
for (WordInfo wordInfo : alternative.getWordsList()) {
double startTime = wordInfo.getStartTime().getSeconds()
+ wordInfo.getStartTime().getNanos() / 1e9;
double endTime = wordInfo.getEndTime().getSeconds()
+ wordInfo.getEndTime().getNanos() / 1e9;
System.out.printf("\t%4.2fs - %4.2fs: %s\n",
startTime, endTime, wordInfo.getWord());
}
} else {
System.out.println("No transcription found");
}
} catch (IndexOutOfBoundsException ioe) {
System.out.println("Could not retrieve frame: " + ioe.getMessage());
}
}
}
// [END video_speech_transcription_gcs]
anguillanneuf marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
10 changes: 10 additions & 0 deletions video/cloud-client/src/test/java/com/example/video/DetectIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class DetectIT {
static final String LABEL_FILE_LOCATION = "./resources/cat.mp4";
static final String SHOTS_FILE_LOCATION = "gs://demomaker/gbikes_dinosaur.mp4";
static final String EXPLICIT_CONTENT_LOCATION = "gs://demomaker/cat.mp4";
static final String SPEECH_GCS_LOCATION = "gs://python-docs-samples-tests/video/googlework_short.mp4";

@Before
public void setUp() {
Expand Down Expand Up @@ -84,4 +85,13 @@ public void testShots() throws Exception {
assertThat(got).contains("Shots:");
assertThat(got).contains("Location: 0");
}

@Test
public void testSpeechTranscription() throws Exception {
  // Run the sample's CLI entry point against the shared test video.
  Detect.argsHelper(new String[] {"speech-transcription", SPEECH_GCS_LOCATION});

  // The captured stdout is expected to include this word from the video's transcript.
  String output = bout.toString();
  assertThat(output).contains("cultural");
}
}