implement text recognition for ios (react-native-camera#1775)
fix typo

fix aspect ratio
dov11 authored and sibelius committed Sep 3, 2018
1 parent 1c9e83d commit e123108
Showing 13 changed files with 337 additions and 11 deletions.
21 changes: 15 additions & 6 deletions README.md
@@ -8,7 +8,7 @@ Supports:
- videos
- face detection
- barcode scanning
-- text recognition (Android only)
+- text recognition (optional installation for iOS using CocoaPods)


### Example import
@@ -102,7 +102,7 @@ pod 'react-native-camera', path: '../node_modules/react-native-camera'

### Face Detection or Text Recognition Steps

-Face Detection is optional on iOS. If you want it, you are going to need to install Google Mobile Vision frameworks in your project, as mentioned in the next section.
+Face Detection/Text Recognition are optional on iOS. If you want them, you are going to need to install Google Mobile Vision frameworks in your project, as mentioned in the next section.

##### No Face Detection steps

@@ -120,15 +120,15 @@ cp node_modules/react-native-camera/postinstall_project/projectWithoutFaceDetect

And add something like this to the `scripts` section in your `package.json`:

-*Note:* The face detection code is excluded by default for the **CocoaPods** installation.
+*Note:* The face detection/text recognition code is excluded by default for the **CocoaPods** installation.
```
"postinstall": "./scripts/post.sh",
```

##### Installing GMV frameworks
-GMV (Google Mobile Vision) is used for Face detection by the iOS RNCamera. You have to link the google frameworks to your project to successfully compile the RNCamera project.
+GMV (Google Mobile Vision) is used for Face detection/Text recognition by the iOS RNCamera. You have to link the google frameworks to your project to successfully compile the RNCamera project.

-###### CocoaPods Path
+###### CocoaPods Path (The only option for Text Recognition)

Modify the dependency towards `react-native-camera` in your
`Podfile`, from
@@ -137,14 +137,23 @@ Modify the dependency towards `react-native-camera` in your
pod 'react-native-camera', path: '../node_modules/react-native-camera'
```

-to
+to (for Face Detection)

```
pod 'react-native-camera', path: '../node_modules/react-native-camera', subspecs: [
'FaceDetector'
]
```

or to (for Text Recognition)

```
pod 'react-native-camera', path: '../node_modules/react-native-camera', subspecs: [
'TextDetector'
]
```
*Note:* Text recognition is available only via CocoaPods Path

###### Non-CocoaPods Path
1. Download:
Google Symbol Utilities: https://www.gstatic.com/cpdc/dbffca986f6337f8-GoogleSymbolUtilities-1.1.1.tar.gz
2 changes: 1 addition & 1 deletion docs/RNCamera.md
@@ -336,7 +336,7 @@ Classification is determining whether a certain facial characteristic is present

### Text Recognition Related props

-Only available in Android. RNCamera uses the Google Mobile Vision frameworks for Text Recognition, you can read more info about it [here](https://developers.google.com/vision/android/text-overview).
+RNCamera uses the Google Mobile Vision frameworks for Text Recognition, you can read more info about it [here](https://developers.google.com/vision/android/text-overview).

#### `onTextRecognized`

12 changes: 11 additions & 1 deletion ios/RN/RNCamera.h
@@ -9,16 +9,23 @@
#import "RNFaceDetectorManagerStub.h"
#endif

#if __has_include("TextDetectorManager.h")
#import "TextDetectorManager.h"
#else
#import "TextDetectorManagerStub.h"
#endif

@class RNCamera;

-@interface RNCamera : UIView <AVCaptureMetadataOutputObjectsDelegate, AVCaptureFileOutputRecordingDelegate, RNFaceDetectorDelegate>
+@interface RNCamera : UIView <AVCaptureMetadataOutputObjectsDelegate, AVCaptureFileOutputRecordingDelegate, RNFaceDetectorDelegate, AVCaptureVideoDataOutputSampleBufferDelegate>

@property(nonatomic, strong) dispatch_queue_t sessionQueue;
@property(nonatomic, strong) AVCaptureSession *session;
@property(nonatomic, strong) AVCaptureDeviceInput *videoCaptureDeviceInput;
@property(nonatomic, strong) AVCaptureStillImageOutput *stillImageOutput;
@property(nonatomic, strong) AVCaptureMovieFileOutput *movieFileOutput;
@property(nonatomic, strong) AVCaptureMetadataOutput *metadataOutput;
@property(nonatomic, strong) AVCaptureVideoDataOutput *videoDataOutput;
@property(nonatomic, strong) id runtimeErrorHandlingObserver;
@property(nonatomic, strong) AVCaptureVideoPreviewLayer *previewLayer;
@property(nonatomic, strong) NSArray *barCodeTypes;
@@ -32,6 +39,7 @@
@property (assign, nonatomic) AVCaptureSessionPreset pictureSize;
@property (nonatomic, assign) BOOL isReadingBarCodes;
@property (nonatomic, assign) BOOL isDetectingFaces;
@property (nonatomic, assign) BOOL canReadText;
@property(assign, nonatomic) AVVideoCodecType videoCodecType;
@property (assign, nonatomic) AVCaptureVideoStabilizationMode videoStabilizationMode;

@@ -53,11 +61,13 @@
- (void)resumePreview;
- (void)pausePreview;
- (void)setupOrDisableBarcodeScanner;
- (void)setupOrDisableTextDetector;
- (void)onReady:(NSDictionary *)event;
- (void)onMountingError:(NSDictionary *)event;
- (void)onCodeRead:(NSDictionary *)event;
- (void)onFacesDetected:(NSDictionary *)event;
- (void)onPictureSaved:(NSDictionary *)event;
- (void)onText:(NSDictionary *)event;

@end

106 changes: 106 additions & 0 deletions ios/RN/RNCamera.m
@@ -16,12 +16,16 @@ @interface RNCamera ()
@property (nonatomic, strong) RCTPromiseResolveBlock videoRecordedResolve;
@property (nonatomic, strong) RCTPromiseRejectBlock videoRecordedReject;
@property (nonatomic, strong) id faceDetectorManager;
@property (nonatomic, strong) id textDetector;

@property (nonatomic, copy) RCTDirectEventBlock onCameraReady;
@property (nonatomic, copy) RCTDirectEventBlock onMountError;
@property (nonatomic, copy) RCTDirectEventBlock onBarCodeRead;
@property (nonatomic, copy) RCTDirectEventBlock onTextRecognized;
@property (nonatomic, copy) RCTDirectEventBlock onFacesDetected;
@property (nonatomic, copy) RCTDirectEventBlock onPictureSaved;
@property (nonatomic, assign) BOOL finishedReadingText;
@property (nonatomic, copy) NSDate *start;

@end

@@ -35,6 +39,9 @@ - (id)initWithBridge:(RCTBridge *)bridge
self.bridge = bridge;
self.session = [AVCaptureSession new];
self.sessionQueue = dispatch_queue_create("cameraQueue", DISPATCH_QUEUE_SERIAL);
self.textDetector = [self createTextDetector];
self.finishedReadingText = true;
self.start = [NSDate date];
self.faceDetectorManager = [self createFaceDetectorManager];
#if !(TARGET_IPHONE_SIMULATOR)
self.previewLayer =
@@ -93,6 +100,13 @@ - (void)onPictureSaved:(NSDictionary *)event
}
}

- (void)onText:(NSDictionary *)event
{
if (_onTextRecognized && _session) {
_onTextRecognized(event);
}
}

- (void)layoutSubviews
{
[super layoutSubviews];
@@ -423,6 +437,7 @@ - (void)record:(NSDictionary *)options resolve:(RCTPromiseResolveBlock)resolve r
// We stop face detection here and restart it in when AVCaptureMovieFileOutput finishes recording.
#if __has_include(<GoogleMobileVision/GoogleMobileVision.h>)
[_faceDetectorManager stopFaceDetection];
[self stopTextRecognition];
#endif
[self setupMovieFileCapture];
}
@@ -534,6 +549,9 @@ - (void)startSession

#if __has_include(<GoogleMobileVision/GoogleMobileVision.h>)
[_faceDetectorManager maybeStartFaceDetectionOnSession:_session withPreviewLayer:_previewLayer];
if ([self.textDetector isRealDetector]) {
[self setupOrDisableTextDetector];
}
#else
// If AVCaptureVideoDataOutput is not required because of Google Vision
// (see comment in -record), we go ahead and add the AVCaptureMovieFileOutput
@@ -569,6 +587,9 @@ - (void)stopSession
#if __has_include(<GoogleMobileVision/GoogleMobileVision.h>)
[_faceDetectorManager stopFaceDetection];
#endif
if ([self.textDetector isRealDetector]) {
[self stopTextRecognition];
}
[self.previewLayer removeFromSuperlayer];
[self.session commitConfiguration];
[self.session stopRunning];
@@ -864,6 +885,11 @@ - (void)cleanupCamera {
[_faceDetectorManager maybeStartFaceDetectionOnSession:_session withPreviewLayer:_previewLayer];
#endif

if ([self.textDetector isRealDetector]) {
[self cleanupMovieFileCapture];
[self setupOrDisableTextDetector];
}

if (self.session.sessionPreset != AVCaptureSessionPresetPhoto) {
[self updateSessionPreset:AVCaptureSessionPresetPhoto];
}
@@ -941,4 +967,84 @@ - (void)onFacesDetected:(NSArray<NSDictionary *> *)faces
}
}

# pragma mark - TextDetector

-(id)createTextDetector
{
Class textDetectorManagerClass = NSClassFromString(@"TextDetectorManager");
Class textDetectorManagerStubClass =
NSClassFromString(@"TextDetectorManagerStub");

#if __has_include(<GoogleMobileVision/GoogleMobileVision.h>)
if (textDetectorManagerClass) {
return [[textDetectorManagerClass alloc] init];
} else if (textDetectorManagerStubClass) {
return [[textDetectorManagerStubClass alloc] init];
}
#endif

return nil;
}

- (void)setupOrDisableTextDetector
{
if ([self canReadText] && [self.textDetector isRealDetector]){
self.videoDataOutput = [[AVCaptureVideoDataOutput alloc] init];
if (![self.session canAddOutput:_videoDataOutput]) {
NSLog(@"Failed to setup video data output");
[self stopTextRecognition];
return;
}
NSDictionary *rgbOutputSettings = [NSDictionary
dictionaryWithObject:[NSNumber numberWithInt:kCMPixelFormat_32BGRA]
forKey:(id)kCVPixelBufferPixelFormatTypeKey];
[self.videoDataOutput setVideoSettings:rgbOutputSettings];
[self.videoDataOutput setAlwaysDiscardsLateVideoFrames:YES];
[self.videoDataOutput setSampleBufferDelegate:self queue:self.sessionQueue];
[self.session addOutput:_videoDataOutput];
} else {
[self stopTextRecognition];
}
}

- (void)captureOutput:(AVCaptureOutput *)captureOutput
didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
fromConnection:(AVCaptureConnection *)connection
{
if (![self.textDetector isRealDetector]) {
return;
}

// Do not submit image for text recognition too often:
// 1. we only dispatch events every 500ms anyway
// 2. wait until previous recognition is finished
// 3. let user disable text recognition, e.g. onTextRecognized={someCondition ? null : this.textRecognized}
NSDate *methodFinish = [NSDate date];
NSTimeInterval timePassed = [methodFinish timeIntervalSinceDate:self.start];
if (timePassed > 0.5 && _finishedReadingText && [self canReadText]) {
CGSize previewSize = CGSizeMake(_previewLayer.frame.size.width, _previewLayer.frame.size.height);
UIImage *image = [RNCameraUtils convertBufferToUIImage:sampleBuffer previewSize:previewSize];
// take care of the fact that preview dimensions differ from the ones of the image that we submit for text detection
float scaleX = _previewLayer.frame.size.width / image.size.width;
float scaleY = _previewLayer.frame.size.height / image.size.height;

// find text features
_finishedReadingText = false;
self.start = [NSDate date];
NSArray *textBlocks = [self.textDetector findTextBlocksInFrame:image scaleX:scaleX scaleY:scaleY];
NSDictionary *eventText = @{@"type" : @"TextBlock", @"textBlocks" : textBlocks};
[self onText:eventText];

_finishedReadingText = true;
}
}

- (void)stopTextRecognition
{
if (self.videoDataOutput) {
[self.session removeOutput:self.videoDataOutput];
}
self.videoDataOutput = nil;
}

@end
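The `TextDetectorManager` that `createTextDetector` looks up is among the 13 changed files but is not expanded on this page. As a hedged sketch only — assuming it wraps GMV's text detector, where `GMVDetector`, `GMVDetectorTypeText` and `GMVTextBlockFeature` are GMV API names, and where the dictionary keys and the `scaleX`/`scaleY` scaling are assumptions mirroring the contract used in `captureOutput:` above, not code from this commit — it could look roughly like this:

```
// Hypothetical sketch of TextDetectorManager.m -- NOT the file from this
// commit, which is not expanded on this page.
#import <UIKit/UIKit.h>
#import <GoogleMobileVision/GoogleMobileVision.h>
#import "TextDetectorManager.h"

@implementation TextDetectorManager {
    GMVDetector *_detector;
}

- (instancetype)init
{
    if (self = [super init]) {
        // GMV's on-device text recognizer.
        _detector = [GMVDetector detectorOfType:GMVDetectorTypeText options:nil];
    }
    return self;
}

- (BOOL)isRealDetector
{
    return YES;
}

- (NSArray *)findTextBlocksInFrame:(UIImage *)image scaleX:(float)scaleX scaleY:(float)scaleY
{
    NSArray<GMVTextBlockFeature *> *features = [_detector featuresInImage:image options:nil];
    NSMutableArray *textBlocks = [NSMutableArray array];
    for (GMVTextBlockFeature *feature in features) {
        // Map coordinates from the downscaled buffer image back into
        // preview-layer points, per the scaleX/scaleY computed by the caller.
        CGRect b = feature.bounds;
        [textBlocks addObject:@{
            @"value" : feature.value ?: @"",
            @"bounds" : @{
                @"origin" : @{@"x" : @(b.origin.x * scaleX), @"y" : @(b.origin.y * scaleY)},
                @"size" : @{@"width" : @(b.size.width * scaleX), @"height" : @(b.size.height * scaleY)}
            }
        }];
    }
    return textBlocks;
}

@end
```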
10 changes: 9 additions & 1 deletion ios/RN/RNCameraManager.m
@@ -17,6 +17,7 @@ @implementation RNCameraManager
RCT_EXPORT_VIEW_PROPERTY(onBarCodeRead, RCTDirectEventBlock);
RCT_EXPORT_VIEW_PROPERTY(onFacesDetected, RCTDirectEventBlock);
RCT_EXPORT_VIEW_PROPERTY(onPictureSaved, RCTDirectEventBlock);
RCT_EXPORT_VIEW_PROPERTY(onTextRecognized, RCTDirectEventBlock);

+ (BOOL)requiresMainQueueSetup
{
@@ -73,7 +74,7 @@ - (NSDictionary *)constantsToExport

- (NSArray<NSString *> *)supportedEvents
{
return @[@"onCameraReady", @"onMountError", @"onBarCodeRead", @"onFacesDetected", @"onPictureSaved"];
return @[@"onCameraReady", @"onMountError", @"onBarCodeRead", @"onFacesDetected", @"onPictureSaved", @"onTextRecognized"];
}

+ (NSDictionary *)validCodecTypes
@@ -230,6 +231,13 @@ + (NSDictionary *)faceDetectorConstants
[view setBarCodeTypes:[RCTConvert NSArray:json]];
}

RCT_CUSTOM_VIEW_PROPERTY(textRecognizerEnabled, BOOL, RNCamera)
{

view.canReadText = [RCTConvert BOOL:json];
[view setupOrDisableTextDetector];
}
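For reference, `RCT_CUSTOM_VIEW_PROPERTY(textRecognizerEnabled, BOOL, RNCamera)` is what connects the JS-side `textRecognizerEnabled` prop to the detector setup; the macro expands to roughly the following setter on the view manager (an approximation of React Native's generated code, not part of this commit):

```
// Rough approximation of the method the RCT_CUSTOM_VIEW_PROPERTY macro
// generates inside RNCameraManager; React Native calls it whenever the
// JS prop value changes.
- (void)set_textRecognizerEnabled:(id)json
                          forView:(RNCamera *)view
                  withDefaultView:(RNCamera *)defaultView
{
    // Convert the incoming JS value, then add or remove the video data output.
    view.canReadText = [RCTConvert BOOL:json];
    [view setupOrDisableTextDetector];
}
```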

RCT_REMAP_METHOD(takePicture,
options:(NSDictionary *)options
reactTag:(nonnull NSNumber *)reactTag
3 changes: 3 additions & 0 deletions ios/RN/RNCameraUtils.h
@@ -19,5 +19,8 @@
+ (AVCaptureVideoOrientation)videoOrientationForDeviceOrientation:(UIDeviceOrientation)orientation;
+ (AVCaptureVideoOrientation)videoOrientationForInterfaceOrientation:(UIInterfaceOrientation)orientation;

// Text detector utilities
+ (UIImage *)convertBufferToUIImage:(CMSampleBufferRef)sampleBuffer previewSize:(CGSize)previewSize;

@end

44 changes: 44 additions & 0 deletions ios/RN/RNCameraUtils.m
@@ -94,5 +94,49 @@ + (NSString *)captureSessionPresetForVideoResolution:(RNCameraVideoResolution)re
}
}

+ (UIImage *)convertBufferToUIImage:(CMSampleBufferRef)sampleBuffer previewSize:(CGSize)previewSize
{
CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
CIImage *ciImage = [CIImage imageWithCVPixelBuffer:imageBuffer];
// set correct orientation
UIInterfaceOrientation curOrientation = [[UIApplication sharedApplication] statusBarOrientation];

if (curOrientation == UIInterfaceOrientationLandscapeLeft){
ciImage = [ciImage imageByApplyingOrientation:3];
} else if (curOrientation == UIInterfaceOrientationLandscapeRight){
ciImage = [ciImage imageByApplyingOrientation:1];
} else if (curOrientation == UIInterfaceOrientationPortrait){
ciImage = [ciImage imageByApplyingOrientation:6];
} else if (curOrientation == UIInterfaceOrientationPortraitUpsideDown){
ciImage = [ciImage imageByApplyingOrientation:8];
}
float bufferWidth = CVPixelBufferGetWidth(imageBuffer);
float bufferHeight = CVPixelBufferGetHeight(imageBuffer);
// scale down CIImage
float scale = bufferHeight>bufferWidth ? 400 / bufferWidth : 400 / bufferHeight;
CIFilter* scaleFilter = [CIFilter filterWithName:@"CILanczosScaleTransform"];
[scaleFilter setValue:ciImage forKey:kCIInputImageKey];
[scaleFilter setValue:@(scale) forKey:kCIInputScaleKey];
[scaleFilter setValue:@(1) forKey:kCIInputAspectRatioKey];
ciImage = scaleFilter.outputImage;
// convert to UIImage and crop to preview aspect ratio
NSDictionary *contextOptions = @{kCIContextUseSoftwareRenderer : @(false)};
CIContext *temporaryContext = [CIContext contextWithOptions:contextOptions];
CGImageRef videoImage;
CGRect boundingRect;
if (curOrientation == UIInterfaceOrientationLandscapeLeft || curOrientation == UIInterfaceOrientationLandscapeRight) {
boundingRect = CGRectMake(0, 0, bufferWidth*scale, bufferHeight*scale);
} else {
boundingRect = CGRectMake(0, 0, bufferHeight*scale, bufferWidth*scale);
}
videoImage = [temporaryContext createCGImage:ciImage fromRect:boundingRect];
CGRect croppedSize = AVMakeRectWithAspectRatioInsideRect(previewSize, boundingRect);
CGImageRef croppedCGImage = CGImageCreateWithImageInRect(videoImage, croppedSize);
UIImage *image = [[UIImage alloc] initWithCGImage:croppedCGImage];
CGImageRelease(videoImage);
CGImageRelease(croppedCGImage);
return image;
}

@end
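The closing crop above is the "fix aspect ratio" part of this commit: `AVMakeRectWithAspectRatioInsideRect` computes the largest rect with the preview's aspect ratio that fits inside the scaled buffer rect, and the `CGImage` is cropped to it so the frame submitted for text detection matches what the preview shows. A standalone illustration of that geometry, using made-up sizes rather than values from this commit:

```
// Illustration only: example sizes, not values from this commit.
#import <AVFoundation/AVFoundation.h>

static CGRect ExampleCropRect(void)
{
    CGSize previewSize = CGSizeMake(375, 667);        // preview layer, in points
    CGRect scaledBuffer = CGRectMake(0, 0, 400, 533); // downscaled buffer rect
    // Largest rect with previewSize's aspect ratio that fits inside
    // scaledBuffer, centered -- the region the UIImage is cropped to.
    return AVMakeRectWithAspectRatioInsideRect(previewSize, scaledBuffer);
}
```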

8 changes: 8 additions & 0 deletions ios/RN/TextDetectorManagerStub.h
@@ -0,0 +1,8 @@
@interface TextDetectorManager : NSObject

- (instancetype)init;

-(BOOL)isRealDetector;
-(NSArray *)findTextBlocksInFrame:(UIImage *)image scaleX:(float)scaleX scaleY:(float) scaleY;

@end
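`TextDetectorManagerStub.m` is likewise not expanded on this page. Note that the stub header declares the class as `TextDetectorManager`, so builds without the `TextDetector` subspec compile against the same class name that `RNCamera.m` probes. A minimal implementation consistent with this header might be (an assumption, not the committed file):

```
// Hypothetical sketch of TextDetectorManagerStub.m -- NOT the committed file.
// Compiled when the real TextDetectorManager (and GMV) is not installed.
#import <UIKit/UIKit.h>   // UIImage, which the stub header leaves implicit
#import "TextDetectorManagerStub.h"

@implementation TextDetectorManager

- (BOOL)isRealDetector
{
    // Tells RNCamera to skip adding the AVCaptureVideoDataOutput.
    return NO;
}

- (NSArray *)findTextBlocksInFrame:(UIImage *)image scaleX:(float)scaleX scaleY:(float)scaleY
{
    // No GMV available: never any text blocks.
    return @[];
}

@end
```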
